/home/yuqian_fu {'gpu': '0', 'data': 'art_painting', 'ntr': None, 'translate': None, 'autoaug': 'CA_multiple', 'n': 3, 'stride': 5, 'factor_num': 16, 'epochs': 30, 'nbatch': 100, 'batchsize': 6, 'lr': 0.01, 'lr_scheduler': 'cosine', 'svroot': '/data/work-gcp-europe-west4-a/yuqian_fu/datasets/SingleSourceDG/saved-PACS//art_painting/CA_multiple_16fa_v2_ep30_lr0.01_cosine_base0.01_bs6_lamCa_1_lamRe1_adt4_cls1_EW2_70_rmTrue_rnTrue_str5', 'clsadapt': True, 'lambda_causal': 1.0, 'lambda_re': 1.0, 'randm': True, 'randn': True, 'network': 'resnet18'} /data/work-gcp-europe-west4-a/yuqian_fu/datasets/SingleSourceDG/data/PACS/art_painting_train.hdf5 torch.Size([1840, 3, 227, 227]) torch.Size([1840]) --------------------------CA_multiple-------------------------- ---------------------------16 factors----------------- randm: True randn: True n: 3 randm: False /data/work-gcp-europe-west4-a/yuqian_fu/datasets/SingleSourceDG/data/PACS/art_painting_val.hdf5 torch.Size([208, 3, 227, 227]) torch.Size([208]) -------------------------------------loading pretrain weights---------------------------------- 306 cls_loss: tensor(2.3446, device='cuda:0', grad_fn=) cls_loss: tensor(4.6243, device='cuda:0', grad_fn=) cls_loss: tensor(7.0508, device='cuda:0', grad_fn=) cls_loss: tensor(3.5887, device='cuda:0', grad_fn=) cls_loss: tensor(4.5546, device='cuda:0', grad_fn=) cls_loss: tensor(17.9310, device='cuda:0', grad_fn=) cls_loss: tensor(9.2499, device='cuda:0', grad_fn=) cls_loss: tensor(36.5010, device='cuda:0', grad_fn=) cls_loss: tensor(35.8023, device='cuda:0', grad_fn=) cls_loss: tensor(19.3028, device='cuda:0', grad_fn=) cls_loss: tensor(8.4857, device='cuda:0', grad_fn=) cls_loss: tensor(19.2803, device='cuda:0', grad_fn=) cls_loss: tensor(17.4525, device='cuda:0', grad_fn=) cls_loss: tensor(15.1171, device='cuda:0', grad_fn=) cls_loss: tensor(9.3067, device='cuda:0', grad_fn=) cls_loss: tensor(20.6821, device='cuda:0', grad_fn=) cls_loss: tensor(11.4511, device='cuda:0', grad_fn=) cls_loss: tensor(6.7550, device='cuda:0', grad_fn=) cls_loss: tensor(10.6936, device='cuda:0', grad_fn=) cls_loss: tensor(12.9821, device='cuda:0', grad_fn=) cls_loss: tensor(19.3366, device='cuda:0', grad_fn=) cls_loss: tensor(4.8249, device='cuda:0', grad_fn=) cls_loss: tensor(17.5163, device='cuda:0', grad_fn=) cls_loss: tensor(2.3895, device='cuda:0', grad_fn=) cls_loss: tensor(1.4897, device='cuda:0', grad_fn=) cls_loss: tensor(6.5002, device='cuda:0', grad_fn=) cls_loss: tensor(3.1022, device='cuda:0', grad_fn=) cls_loss: tensor(2.3366, device='cuda:0', grad_fn=) cls_loss: tensor(6.6002, device='cuda:0', grad_fn=) cls_loss: tensor(8.7183, device='cuda:0', grad_fn=) cls_loss: tensor(7.2559, device='cuda:0', grad_fn=) cls_loss: tensor(4.3325, device='cuda:0', grad_fn=) cls_loss: tensor(1.5716, device='cuda:0', grad_fn=) cls_loss: tensor(3.8314, device='cuda:0', grad_fn=) cls_loss: tensor(4.0708, device='cuda:0', grad_fn=) cls_loss: tensor(7.6634, device='cuda:0', grad_fn=) cls_loss: tensor(1.0142, device='cuda:0', grad_fn=) cls_loss: tensor(6.6389, device='cuda:0', grad_fn=) cls_loss: tensor(5.0640, device='cuda:0', grad_fn=) cls_loss: tensor(3.8086, device='cuda:0', grad_fn=) cls_loss: tensor(2.3700, device='cuda:0', grad_fn=) cls_loss: tensor(1.3503, device='cuda:0', grad_fn=) cls_loss: tensor(3.0020, device='cuda:0', grad_fn=) cls_loss: tensor(5.1936, device='cuda:0', grad_fn=) cls_loss: tensor(7.1166, device='cuda:0', grad_fn=) cls_loss: tensor(14.0024, device='cuda:0', grad_fn=) cls_loss: tensor(11.7174, device='cuda:0', grad_fn=) cls_loss: tensor(5.0330, device='cuda:0', grad_fn=) cls_loss: tensor(3.5052, device='cuda:0', grad_fn=) cls_loss: tensor(0.6397, device='cuda:0', grad_fn=) cls_loss: tensor(6.0466, device='cuda:0', grad_fn=) cls_loss: tensor(7.8281, device='cuda:0', grad_fn=) cls_loss: tensor(2.2224, device='cuda:0', grad_fn=) cls_loss: tensor(8.0215, device='cuda:0', grad_fn=) cls_loss: tensor(22.5807, device='cuda:0', grad_fn=) cls_loss: tensor(3.2865, device='cuda:0', grad_fn=) cls_loss: tensor(0.5660, device='cuda:0', grad_fn=) cls_loss: tensor(7.1732, device='cuda:0', grad_fn=) cls_loss: tensor(3.5026, device='cuda:0', grad_fn=) cls_loss: tensor(7.4415, device='cuda:0', grad_fn=) cls_loss: tensor(4.8072, device='cuda:0', grad_fn=) cls_loss: tensor(5.7344, device='cuda:0', grad_fn=) cls_loss: tensor(9.4205, device='cuda:0', grad_fn=) cls_loss: tensor(0.0113, device='cuda:0', grad_fn=) cls_loss: tensor(9.1797, device='cuda:0', grad_fn=) cls_loss: tensor(8.7357, device='cuda:0', grad_fn=) cls_loss: tensor(1.4409, device='cuda:0', grad_fn=) cls_loss: tensor(0.2914, device='cuda:0', grad_fn=) cls_loss: tensor(4.2577, device='cuda:0', grad_fn=) cls_loss: tensor(0.0102, device='cuda:0', grad_fn=) cls_loss: tensor(5.9049, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(23.5156, device='cuda:0', grad_fn=) cls_loss: tensor(2.0156, device='cuda:0', grad_fn=) cls_loss: tensor(0.9980, device='cuda:0', grad_fn=) cls_loss: tensor(1.5980, device='cuda:0', grad_fn=) cls_loss: tensor(8.2853, device='cuda:0', grad_fn=) cls_loss: tensor(3.2396, device='cuda:0', grad_fn=) cls_loss: tensor(17.5547, device='cuda:0', grad_fn=) cls_loss: tensor(4.7125, device='cuda:0', grad_fn=) cls_loss: tensor(2.8796, device='cuda:0', grad_fn=) cls_loss: tensor(17.2038, device='cuda:0', grad_fn=) cls_loss: tensor(1.7163, device='cuda:0', grad_fn=) cls_loss: tensor(16.7873, device='cuda:0', grad_fn=) cls_loss: tensor(11.6703, device='cuda:0', grad_fn=) cls_loss: tensor(6.1760, device='cuda:0', grad_fn=) cls_loss: tensor(9.6429, device='cuda:0', grad_fn=) cls_loss: tensor(5.3490, device='cuda:0', grad_fn=) cls_loss: tensor(1.9469, device='cuda:0', grad_fn=) cls_loss: tensor(3.3686, device='cuda:0', grad_fn=) cls_loss: tensor(2.4827, device='cuda:0', grad_fn=) cls_loss: tensor(9.0186, device='cuda:0', grad_fn=) cls_loss: tensor(8.4948, device='cuda:0', grad_fn=) cls_loss: tensor(7.8542, device='cuda:0', grad_fn=) cls_loss: tensor(0.1042, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(11.8472, device='cuda:0', grad_fn=) cls_loss: tensor(11.6823, device='cuda:0', grad_fn=) cls_loss: tensor(9.4141, device='cuda:0', grad_fn=) cls_loss: tensor(0.0734, device='cuda:0', grad_fn=) cls_loss: tensor(1.0645, device='cuda:0', grad_fn=) cls_loss: tensor(0.4192, device='cuda:0', grad_fn=) cls_loss: tensor(12.8991, device='cuda:0', grad_fn=) cls_loss: tensor(7.1502, device='cuda:0', grad_fn=) cls_loss: tensor(11.0729, device='cuda:0', grad_fn=) cls_loss: tensor(16.5872, device='cuda:0', grad_fn=) cls_loss: tensor(1.7119, device='cuda:0', grad_fn=) cls_loss: tensor(19.4049, device='cuda:0', grad_fn=) cls_loss: tensor(2.4810, device='cuda:0', grad_fn=) cls_loss: tensor(3.3760, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.6077, device='cuda:0', grad_fn=) cls_loss: tensor(1.0319, device='cuda:0', grad_fn=) cls_loss: tensor(11.4950, device='cuda:0', grad_fn=) cls_loss: tensor(24.6309, device='cuda:0', grad_fn=) cls_loss: tensor(3.2344, device='cuda:0', grad_fn=) cls_loss: tensor(10.7244, device='cuda:0', grad_fn=) cls_loss: tensor(15.6605, device='cuda:0', grad_fn=) cls_loss: tensor(16.7064, device='cuda:0', grad_fn=) cls_loss: tensor(10.1146, device='cuda:0', grad_fn=) cls_loss: tensor(2.1648, device='cuda:0', grad_fn=) cls_loss: tensor(4.3370, device='cuda:0', grad_fn=) cls_loss: tensor(5.8855, device='cuda:0', grad_fn=) cls_loss: tensor(6.3114, device='cuda:0', grad_fn=) cls_loss: tensor(1.1464, device='cuda:0', grad_fn=) cls_loss: tensor(2.9922e-05, device='cuda:0', grad_fn=) cls_loss: tensor(10.1925, device='cuda:0', grad_fn=) cls_loss: tensor(8.0094, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(6.7053, device='cuda:0', grad_fn=) cls_loss: tensor(10.4371, device='cuda:0', grad_fn=) cls_loss: tensor(3.9393, device='cuda:0', grad_fn=) cls_loss: tensor(3.6745, device='cuda:0', grad_fn=) cls_loss: tensor(3.7712, device='cuda:0', grad_fn=) cls_loss: tensor(9.5479, device='cuda:0', grad_fn=) cls_loss: tensor(23.6750, device='cuda:0', grad_fn=) cls_loss: tensor(10.9434, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.9200, device='cuda:0', grad_fn=) cls_loss: tensor(6.0892, device='cuda:0', grad_fn=) cls_loss: tensor(0.0309, device='cuda:0', grad_fn=) cls_loss: tensor(39.4583, device='cuda:0', grad_fn=) cls_loss: tensor(11.2708, device='cuda:0', grad_fn=) cls_loss: tensor(31.9609, device='cuda:0', grad_fn=) cls_loss: tensor(9.8249, device='cuda:0', grad_fn=) cls_loss: tensor(16.3997, device='cuda:0', grad_fn=) cls_loss: tensor(6.7865, device='cuda:0', grad_fn=) cls_loss: tensor(0.8835, device='cuda:0', grad_fn=) cls_loss: tensor(11.5104, device='cuda:0', grad_fn=) cls_loss: tensor(14.2083, device='cuda:0', grad_fn=) cls_loss: tensor(4.3281, device='cuda:0', grad_fn=) cls_loss: tensor(9.5312, device='cuda:0', grad_fn=) cls_loss: tensor(2.3317, device='cuda:0', grad_fn=) cls_loss: tensor(3.9089, device='cuda:0', grad_fn=) cls_loss: tensor(0.0071, device='cuda:0', grad_fn=) cls_loss: tensor(8.4784, device='cuda:0', grad_fn=) cls_loss: tensor(2.1875, device='cuda:0', grad_fn=) cls_loss: tensor(1.3574, device='cuda:0', grad_fn=) cls_loss: tensor(10.4934, device='cuda:0', grad_fn=) cls_loss: tensor(7.6799, device='cuda:0', grad_fn=) cls_loss: tensor(2.0357, device='cuda:0', grad_fn=) cls_loss: tensor(1.5466, device='cuda:0', grad_fn=) cls_loss: tensor(6.8312, device='cuda:0', grad_fn=) cls_loss: tensor(4.9097, device='cuda:0', grad_fn=) cls_loss: tensor(9.3022, device='cuda:0', grad_fn=) cls_loss: tensor(21.6758, device='cuda:0', grad_fn=) cls_loss: tensor(8.0790, device='cuda:0', grad_fn=) cls_loss: tensor(10.7370, device='cuda:0', grad_fn=) cls_loss: tensor(18.8994, device='cuda:0', grad_fn=) cls_loss: tensor(4.6459, device='cuda:0', grad_fn=) cls_loss: tensor(1.7805, device='cuda:0', grad_fn=) cls_loss: tensor(9.6875, device='cuda:0', grad_fn=) cls_loss: tensor(17.0999, device='cuda:0', grad_fn=) cls_loss: tensor(5.5316, device='cuda:0', grad_fn=) cls_loss: tensor(11.5190, device='cuda:0', grad_fn=) cls_loss: tensor(5.2225, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.8672, device='cuda:0', grad_fn=) cls_loss: tensor(8.0761, device='cuda:0', grad_fn=) cls_loss: tensor(13.1353, device='cuda:0', grad_fn=) cls_loss: tensor(10.5169, device='cuda:0', grad_fn=) cls_loss: tensor(1.7787, device='cuda:0', grad_fn=) cls_loss: tensor(6.4677, device='cuda:0', grad_fn=) cls_loss: tensor(1.7448, device='cuda:0', grad_fn=) cls_loss: tensor(17.0208, device='cuda:0', grad_fn=) cls_loss: tensor(14.2741, device='cuda:0', grad_fn=) cls_loss: tensor(12.1761, device='cuda:0', grad_fn=) cls_loss: tensor(7.6225, device='cuda:0', grad_fn=) cls_loss: tensor(7.8777e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6533, device='cuda:0', grad_fn=) cls_loss: tensor(5.9635, device='cuda:0', grad_fn=) cls_loss: tensor(22.5258, device='cuda:0', grad_fn=) cls_loss: tensor(12.2950, device='cuda:0', grad_fn=) cls_loss: tensor(0.5653, device='cuda:0', grad_fn=) cls_loss: tensor(5.0719, device='cuda:0', grad_fn=) cls_loss: tensor(13.3756, device='cuda:0', grad_fn=) cls_loss: tensor(0.0231, device='cuda:0', grad_fn=) cls_loss: tensor(14.6576, device='cuda:0', grad_fn=) cls_loss: tensor(5.4893, device='cuda:0', grad_fn=) cls_loss: tensor(10.4193, device='cuda:0', grad_fn=) cls_loss: tensor(1.3113e-06, device='cuda:0', grad_fn=) cls_loss: tensor(14.0755, device='cuda:0', grad_fn=) cls_loss: tensor(0.0705, device='cuda:0', grad_fn=) cls_loss: tensor(0.8191, device='cuda:0', grad_fn=) cls_loss: tensor(14.9922, device='cuda:0', grad_fn=) cls_loss: tensor(8.9609, device='cuda:0', grad_fn=) cls_loss: tensor(6.1056, device='cuda:0', grad_fn=) cls_loss: tensor(6.3305, device='cuda:0', grad_fn=) cls_loss: tensor(0.7598, device='cuda:0', grad_fn=) cls_loss: tensor(2.5431e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.3733, device='cuda:0', grad_fn=) cls_loss: tensor(4.4062, device='cuda:0', grad_fn=) cls_loss: tensor(7.9471, device='cuda:0', grad_fn=) cls_loss: tensor(10.5917, device='cuda:0', grad_fn=) cls_loss: tensor(0.0062, device='cuda:0', grad_fn=) cls_loss: tensor(1.3914, device='cuda:0', grad_fn=) cls_loss: tensor(3.2969, device='cuda:0', grad_fn=) cls_loss: tensor(0.8442, device='cuda:0', grad_fn=) cls_loss: tensor(3.5560, device='cuda:0', grad_fn=) cls_loss: tensor(6.0092, device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(29.8678, device='cuda:0', grad_fn=) cls_loss: tensor(3.4195, device='cuda:0', grad_fn=) cls_loss: tensor(1.5625, device='cuda:0', grad_fn=) cls_loss: tensor(0.8242, device='cuda:0', grad_fn=) cls_loss: tensor(0.0477, device='cuda:0', grad_fn=) cls_loss: tensor(0.6443, device='cuda:0', grad_fn=) cls_loss: tensor(3.0523, device='cuda:0', grad_fn=) cls_loss: tensor(4.0591, device='cuda:0', grad_fn=) cls_loss: tensor(4.4896, device='cuda:0', grad_fn=) cls_loss: tensor(5.8333, device='cuda:0', grad_fn=) cls_loss: tensor(20.5466, device='cuda:0', grad_fn=) cls_loss: tensor(10.5044, device='cuda:0', grad_fn=) cls_loss: tensor(9.0892, device='cuda:0', grad_fn=) cls_loss: tensor(5.2054, device='cuda:0', grad_fn=) cls_loss: tensor(2.0742, device='cuda:0', grad_fn=) cls_loss: tensor(0.5381, device='cuda:0', grad_fn=) cls_loss: tensor(2.1324, device='cuda:0', grad_fn=) cls_loss: tensor(6.3841, device='cuda:0', grad_fn=) cls_loss: tensor(0.7033, device='cuda:0', grad_fn=) cls_loss: tensor(0.6604, device='cuda:0', grad_fn=) cls_loss: tensor(0.6730, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(15.9594, device='cuda:0', grad_fn=) cls_loss: tensor(2.7100e-05, device='cuda:0', grad_fn=) cls_loss: tensor(13.0859, device='cuda:0', grad_fn=) cls_loss: tensor(9.2656, device='cuda:0', grad_fn=) cls_loss: tensor(3.6675, device='cuda:0', grad_fn=) cls_loss: tensor(39.3151, device='cuda:0', grad_fn=) cls_loss: tensor(6.0521, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.6380, device='cuda:0', grad_fn=) cls_loss: tensor(3.9792, device='cuda:0', grad_fn=) cls_loss: tensor(16.5840, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(18.2604, device='cuda:0', grad_fn=) cls_loss: tensor(4.1920, device='cuda:0', grad_fn=) cls_loss: tensor(13.4916, device='cuda:0', grad_fn=) cls_loss: tensor(12.1998, device='cuda:0', grad_fn=) cls_loss: tensor(7.8933, device='cuda:0', grad_fn=) cls_loss: tensor(8.5143, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.4958, device='cuda:0', grad_fn=) cls_loss: tensor(14.9167, device='cuda:0', grad_fn=) cls_loss: tensor(15.8158, device='cuda:0', grad_fn=) cls_loss: tensor(0.0066, device='cuda:0', grad_fn=) cls_loss: tensor(6.0156, device='cuda:0', grad_fn=) cls_loss: tensor(0.1756, device='cuda:0', grad_fn=) cls_loss: tensor(0.1838, device='cuda:0', grad_fn=) cls_loss: tensor(8.0397, device='cuda:0', grad_fn=) cls_loss: tensor(8.7152, device='cuda:0', grad_fn=) cls_loss: tensor(8.6850, device='cuda:0', grad_fn=) cls_loss: tensor(15.2419, device='cuda:0', grad_fn=) cls_loss: tensor(5.0326, device='cuda:0', grad_fn=) cls_loss: tensor(4.0405, device='cuda:0', grad_fn=) cls_loss: tensor(9.5430, device='cuda:0', grad_fn=) cls_loss: tensor(16.6875, device='cuda:0', grad_fn=) cls_loss: tensor(13.1042, device='cuda:0', grad_fn=) cls_loss: tensor(7.5299, device='cuda:0', grad_fn=) cls_loss: tensor(5.5645, device='cuda:0', grad_fn=) cls_loss: tensor(22.6590, device='cuda:0', grad_fn=) cls_loss: tensor(0.0199, device='cuda:0', grad_fn=) cls_loss: tensor(7.6448, device='cuda:0', grad_fn=) cls_loss: tensor(6.4255, device='cuda:0', grad_fn=) cls_loss: tensor(13.2031, device='cuda:0', grad_fn=) cls_loss: tensor(5.3125, device='cuda:0', grad_fn=) cls_loss: tensor(0.1860, device='cuda:0', grad_fn=) cls_loss: tensor(3.4945, device='cuda:0', grad_fn=) cls_loss: tensor(11.0105, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(12.3591, device='cuda:0', grad_fn=) cls_loss: tensor(0.5732, device='cuda:0', grad_fn=) cls_loss: tensor(2.0977, device='cuda:0', grad_fn=) cls_loss: tensor(7.4883, device='cuda:0', grad_fn=) cls_loss: tensor(3.3385, device='cuda:0', grad_fn=) cls_loss: tensor(22.2168, device='cuda:0', grad_fn=) cls_loss: tensor(0.6994, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(3.4457, device='cuda:0', grad_fn=) cls_loss: tensor(12.6354, device='cuda:0', grad_fn=) cls_loss: tensor(9.6914, device='cuda:0', grad_fn=) cls_loss: tensor(5.2756, device='cuda:0', grad_fn=) cls_loss: tensor(0.1323, device='cuda:0', grad_fn=) cls_loss: tensor(18.9355, device='cuda:0', grad_fn=) cls_loss: tensor(6.3543, device='cuda:0', grad_fn=) cls_loss: tensor(5.2566, device='cuda:0', grad_fn=) 0.0001 changing lr ---------------------saving model at epoch 0---------------------------------------------------- epoch 0, time 301.10, cls_loss 7.5080 306 cls_loss: tensor(7.7917, device='cuda:0', grad_fn=) cls_loss: tensor(3.1354, device='cuda:0', grad_fn=) cls_loss: tensor(3.0104, device='cuda:0', grad_fn=) cls_loss: tensor(14.9401, device='cuda:0', grad_fn=) cls_loss: tensor(3.2783e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.8073, device='cuda:0', grad_fn=) cls_loss: tensor(6.3574, device='cuda:0', grad_fn=) cls_loss: tensor(3.0165, device='cuda:0', grad_fn=) cls_loss: tensor(13.1868, device='cuda:0', grad_fn=) cls_loss: tensor(12.8099, device='cuda:0', grad_fn=) cls_loss: tensor(1.3040, device='cuda:0', grad_fn=) cls_loss: tensor(4.9460, device='cuda:0', grad_fn=) cls_loss: tensor(18.5547, device='cuda:0', grad_fn=) cls_loss: tensor(7.0354, device='cuda:0', grad_fn=) cls_loss: tensor(2.1458, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(6.2068e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.6548, device='cuda:0', grad_fn=) cls_loss: tensor(9.4225, device='cuda:0', grad_fn=) cls_loss: tensor(2.3176, device='cuda:0', grad_fn=) cls_loss: tensor(3.1392e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0755, device='cuda:0', grad_fn=) cls_loss: tensor(6.3341, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.6943, device='cuda:0', grad_fn=) cls_loss: tensor(5.1790, device='cuda:0', grad_fn=) cls_loss: tensor(1.4896, device='cuda:0', grad_fn=) cls_loss: tensor(5.7031, device='cuda:0', grad_fn=) cls_loss: tensor(0.8270, device='cuda:0', grad_fn=) cls_loss: tensor(0.3695, device='cuda:0', grad_fn=) cls_loss: tensor(0.5430, device='cuda:0', grad_fn=) cls_loss: tensor(5.1967, device='cuda:0', grad_fn=) cls_loss: tensor(0.0152, device='cuda:0', grad_fn=) cls_loss: tensor(2.2949, device='cuda:0', grad_fn=) cls_loss: tensor(1.5299e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1615, device='cuda:0', grad_fn=) cls_loss: tensor(3.6068, device='cuda:0', grad_fn=) cls_loss: tensor(2.2457, device='cuda:0', grad_fn=) cls_loss: tensor(4.3717, device='cuda:0', grad_fn=) cls_loss: tensor(2.7396, device='cuda:0', grad_fn=) cls_loss: tensor(3.1719, device='cuda:0', grad_fn=) cls_loss: tensor(1.0510e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.4334, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0346, device='cuda:0', grad_fn=) cls_loss: tensor(7.2083, device='cuda:0', grad_fn=) cls_loss: tensor(2.9540, device='cuda:0', grad_fn=) cls_loss: tensor(7.3763, device='cuda:0', grad_fn=) cls_loss: tensor(3.9319, device='cuda:0', grad_fn=) cls_loss: tensor(7.7163, device='cuda:0', grad_fn=) cls_loss: tensor(5.1289, device='cuda:0', grad_fn=) cls_loss: tensor(7.0905, device='cuda:0', grad_fn=) cls_loss: tensor(3.9583, device='cuda:0', grad_fn=) cls_loss: tensor(5.1636, device='cuda:0', grad_fn=) cls_loss: tensor(12.1877, device='cuda:0', grad_fn=) cls_loss: tensor(5.2576, device='cuda:0', grad_fn=) cls_loss: tensor(7.6297, device='cuda:0', grad_fn=) cls_loss: tensor(2.5313, device='cuda:0', grad_fn=) cls_loss: tensor(1.0272e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.3856e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3984, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(9.8864e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1559, device='cuda:0', grad_fn=) cls_loss: tensor(0.0486, device='cuda:0', grad_fn=) cls_loss: tensor(2.3438, device='cuda:0', grad_fn=) cls_loss: tensor(7.5632, device='cuda:0', grad_fn=) cls_loss: tensor(9.1908, device='cuda:0', grad_fn=) cls_loss: tensor(4.7656, device='cuda:0', grad_fn=) cls_loss: tensor(14.1272, device='cuda:0', grad_fn=) cls_loss: tensor(1.1772, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.8366, device='cuda:0', grad_fn=) cls_loss: tensor(0.0596, device='cuda:0', grad_fn=) cls_loss: tensor(8.7578, device='cuda:0', grad_fn=) cls_loss: tensor(4.5417, device='cuda:0', grad_fn=) cls_loss: tensor(1.9219, device='cuda:0', grad_fn=) cls_loss: tensor(2.3646, device='cuda:0', grad_fn=) cls_loss: tensor(13.7507, device='cuda:0', grad_fn=) cls_loss: tensor(7.0798, device='cuda:0', grad_fn=) cls_loss: tensor(4.7761, device='cuda:0', grad_fn=) cls_loss: tensor(2.3040, device='cuda:0', grad_fn=) cls_loss: tensor(8.8255, device='cuda:0', grad_fn=) cls_loss: tensor(4.2458e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3906, device='cuda:0', grad_fn=) cls_loss: tensor(8.5392, device='cuda:0', grad_fn=) cls_loss: tensor(5.3158, device='cuda:0', grad_fn=) cls_loss: tensor(2.6882e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1825, device='cuda:0', grad_fn=) cls_loss: tensor(4.6563, device='cuda:0', grad_fn=) cls_loss: tensor(2.9075, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.5547, device='cuda:0', grad_fn=) cls_loss: tensor(0.8893, device='cuda:0', grad_fn=) cls_loss: tensor(1.1112, device='cuda:0', grad_fn=) cls_loss: tensor(2.8809e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.5104, device='cuda:0', grad_fn=) cls_loss: tensor(8.1458, device='cuda:0', grad_fn=) cls_loss: tensor(1.5326, device='cuda:0', grad_fn=) cls_loss: tensor(1.9133e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.0044, device='cuda:0', grad_fn=) cls_loss: tensor(3.0495, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.2448, device='cuda:0', grad_fn=) cls_loss: tensor(2.9219, device='cuda:0', grad_fn=) cls_loss: tensor(4.2708, device='cuda:0', grad_fn=) cls_loss: tensor(7.2604, device='cuda:0', grad_fn=) cls_loss: tensor(1.8492, device='cuda:0', grad_fn=) cls_loss: tensor(4.5484, device='cuda:0', grad_fn=) cls_loss: tensor(13.1198, device='cuda:0', grad_fn=) cls_loss: tensor(3.9141, device='cuda:0', grad_fn=) cls_loss: tensor(13.5834, device='cuda:0', grad_fn=) cls_loss: tensor(3.3824, device='cuda:0', grad_fn=) cls_loss: tensor(0.6560, device='cuda:0', grad_fn=) cls_loss: tensor(7.8112, device='cuda:0', grad_fn=) cls_loss: tensor(10.0065, device='cuda:0', grad_fn=) cls_loss: tensor(0.1840, device='cuda:0', grad_fn=) cls_loss: tensor(5.9975, device='cuda:0', grad_fn=) cls_loss: tensor(24.5313, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.8932, device='cuda:0', grad_fn=) cls_loss: tensor(6.0335, device='cuda:0', grad_fn=) cls_loss: tensor(6.7881, device='cuda:0', grad_fn=) cls_loss: tensor(2.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0284, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9349, device='cuda:0', grad_fn=) cls_loss: tensor(1.0456, device='cuda:0', grad_fn=) cls_loss: tensor(2.7266, device='cuda:0', grad_fn=) cls_loss: tensor(1.9167, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.2376, device='cuda:0', grad_fn=) cls_loss: tensor(11.2891, device='cuda:0', grad_fn=) cls_loss: tensor(6.1979, device='cuda:0', grad_fn=) cls_loss: tensor(1.9690, device='cuda:0', grad_fn=) cls_loss: tensor(11.5286, device='cuda:0', grad_fn=) cls_loss: tensor(1.3229, device='cuda:0', grad_fn=) cls_loss: tensor(4.9167, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(5.1773, device='cuda:0', grad_fn=) cls_loss: tensor(0.5070, device='cuda:0', grad_fn=) cls_loss: tensor(9.0014, device='cuda:0', grad_fn=) cls_loss: tensor(2.9595, device='cuda:0', grad_fn=) cls_loss: tensor(3.7470, device='cuda:0', grad_fn=) cls_loss: tensor(0.0163, device='cuda:0', grad_fn=) cls_loss: tensor(0.2480, device='cuda:0', grad_fn=) cls_loss: tensor(4.9164, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.6811, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(23.9271, device='cuda:0', grad_fn=) cls_loss: tensor(4.0534, device='cuda:0', grad_fn=) cls_loss: tensor(9.1406, device='cuda:0', grad_fn=) cls_loss: tensor(0.0861, device='cuda:0', grad_fn=) cls_loss: tensor(1.2474, device='cuda:0', grad_fn=) cls_loss: tensor(1.4676, device='cuda:0', grad_fn=) cls_loss: tensor(14.2166, device='cuda:0', grad_fn=) cls_loss: tensor(2.0716, device='cuda:0', grad_fn=) cls_loss: tensor(9.3307, device='cuda:0', grad_fn=) cls_loss: tensor(1.7630, device='cuda:0', grad_fn=) cls_loss: tensor(7.2637, device='cuda:0', grad_fn=) cls_loss: tensor(5.6605e-05, device='cuda:0', grad_fn=) cls_loss: tensor(26.4141, device='cuda:0', grad_fn=) cls_loss: tensor(1.3594, device='cuda:0', grad_fn=) cls_loss: tensor(1.0628, device='cuda:0', grad_fn=) cls_loss: tensor(4.0680, device='cuda:0', grad_fn=) cls_loss: tensor(0.0189, device='cuda:0', grad_fn=) cls_loss: tensor(12.7318, device='cuda:0', grad_fn=) cls_loss: tensor(2.7418e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.5213, device='cuda:0', grad_fn=) cls_loss: tensor(3.6719, device='cuda:0', grad_fn=) cls_loss: tensor(4.3360, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(2.1348, device='cuda:0', grad_fn=) cls_loss: tensor(0.2114, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1901, device='cuda:0', grad_fn=) cls_loss: tensor(0.7852, device='cuda:0', grad_fn=) cls_loss: tensor(6.2307e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(1.8229, device='cuda:0', grad_fn=) cls_loss: tensor(10.4118, device='cuda:0', grad_fn=) cls_loss: tensor(21.1185, device='cuda:0', grad_fn=) cls_loss: tensor(2.5706, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.9167, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.0221, device='cuda:0', grad_fn=) cls_loss: tensor(5.0758, device='cuda:0', grad_fn=) cls_loss: tensor(0.0068, device='cuda:0', grad_fn=) cls_loss: tensor(8.7891, device='cuda:0', grad_fn=) cls_loss: tensor(12.1224, device='cuda:0', grad_fn=) cls_loss: tensor(1.0730, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(4.8132, device='cuda:0', grad_fn=) cls_loss: tensor(5.5469, device='cuda:0', grad_fn=) cls_loss: tensor(3.8771, device='cuda:0', grad_fn=) cls_loss: tensor(0.0219, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(4.6341, device='cuda:0', grad_fn=) cls_loss: tensor(4.3281, device='cuda:0', grad_fn=) cls_loss: tensor(0.0058, device='cuda:0', grad_fn=) cls_loss: tensor(1.0729e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5613, device='cuda:0', grad_fn=) cls_loss: tensor(0.3276, device='cuda:0', grad_fn=) cls_loss: tensor(9.1693, device='cuda:0', grad_fn=) cls_loss: tensor(11.8021, device='cuda:0', grad_fn=) cls_loss: tensor(18.4313, device='cuda:0', grad_fn=) cls_loss: tensor(6.4691e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0593, device='cuda:0', grad_fn=) cls_loss: tensor(10.1328, device='cuda:0', grad_fn=) cls_loss: tensor(4.2346, device='cuda:0', grad_fn=) cls_loss: tensor(5.5917, device='cuda:0', grad_fn=) cls_loss: tensor(3.8208, device='cuda:0', grad_fn=) cls_loss: tensor(8.1525, device='cuda:0', grad_fn=) cls_loss: tensor(1.3751, device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.5768, device='cuda:0', grad_fn=) cls_loss: tensor(4.5744, device='cuda:0', grad_fn=) cls_loss: tensor(1.1478, device='cuda:0', grad_fn=) cls_loss: tensor(13.9935, device='cuda:0', grad_fn=) cls_loss: tensor(0.4038, device='cuda:0', grad_fn=) cls_loss: tensor(7.9688, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.8076, device='cuda:0', grad_fn=) cls_loss: tensor(1.0944, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.5833, device='cuda:0', grad_fn=) cls_loss: tensor(6.4249, device='cuda:0', grad_fn=) cls_loss: tensor(0.4753, device='cuda:0', grad_fn=) cls_loss: tensor(5.5750e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0338, device='cuda:0', grad_fn=) cls_loss: tensor(0.0058, device='cuda:0', grad_fn=) cls_loss: tensor(3.9914, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.5803, device='cuda:0', grad_fn=) cls_loss: tensor(1.8802, device='cuda:0', grad_fn=) cls_loss: tensor(3.1953, device='cuda:0', grad_fn=) cls_loss: tensor(8.4227, device='cuda:0', grad_fn=) cls_loss: tensor(2.3586, device='cuda:0', grad_fn=) cls_loss: tensor(14.4421, device='cuda:0', grad_fn=) cls_loss: tensor(5.0222, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.0499, device='cuda:0', grad_fn=) cls_loss: tensor(8.2292, device='cuda:0', grad_fn=) cls_loss: tensor(1.5339, device='cuda:0', grad_fn=) cls_loss: tensor(4.7891, device='cuda:0', grad_fn=) cls_loss: tensor(0.0882, device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(2.3776, device='cuda:0', grad_fn=) cls_loss: tensor(7.4094, device='cuda:0', grad_fn=) cls_loss: tensor(12.4375, device='cuda:0', grad_fn=) cls_loss: tensor(0.0521, device='cuda:0', grad_fn=) cls_loss: tensor(1.8285, device='cuda:0', grad_fn=) cls_loss: tensor(5.5911, device='cuda:0', grad_fn=) cls_loss: tensor(14.8646, device='cuda:0', grad_fn=) cls_loss: tensor(3.3210, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.0173e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.8740, device='cuda:0', grad_fn=) cls_loss: tensor(2.6159, device='cuda:0', grad_fn=) cls_loss: tensor(9.6510, device='cuda:0', grad_fn=) cls_loss: tensor(5.5938, device='cuda:0', grad_fn=) cls_loss: tensor(0.0632, device='cuda:0', grad_fn=) cls_loss: tensor(1.6966, device='cuda:0', grad_fn=) cls_loss: tensor(20.5742, device='cuda:0', grad_fn=) cls_loss: tensor(1.5234, device='cuda:0', grad_fn=) cls_loss: tensor(9.0675, device='cuda:0', grad_fn=) cls_loss: tensor(2.9888, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7219, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(4.6074, device='cuda:0', grad_fn=) cls_loss: tensor(4.0786, device='cuda:0', grad_fn=) cls_loss: tensor(1.8160e-05, device='cuda:0', grad_fn=) cls_loss: tensor(26.7812, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(8.0426, device='cuda:0', grad_fn=) cls_loss: tensor(1.2814, device='cuda:0', grad_fn=) cls_loss: tensor(7.2644, device='cuda:0', grad_fn=) cls_loss: tensor(10.7331, device='cuda:0', grad_fn=) cls_loss: tensor(8.3796, device='cuda:0', grad_fn=) cls_loss: tensor(7.4758, device='cuda:0', grad_fn=) cls_loss: tensor(5.8083, device='cuda:0', grad_fn=) cls_loss: tensor(12.8438, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.9129, device='cuda:0', grad_fn=) cls_loss: tensor(2.8854, device='cuda:0', grad_fn=) cls_loss: tensor(3.6513, device='cuda:0', grad_fn=) cls_loss: tensor(5.1841, device='cuda:0', grad_fn=) cls_loss: tensor(8.5387, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.4033, device='cuda:0', grad_fn=) cls_loss: tensor(5.5260, device='cuda:0', grad_fn=) cls_loss: tensor(1.5169, device='cuda:0', grad_fn=) cls_loss: tensor(1.9896, device='cuda:0', grad_fn=) cls_loss: tensor(5.9180, device='cuda:0', grad_fn=) cls_loss: tensor(0.3965, device='cuda:0', grad_fn=) cls_loss: tensor(1.1502, device='cuda:0', grad_fn=) cls_loss: tensor(5.7318, device='cuda:0', grad_fn=) 9.972609476841367e-05 changing lr epoch 1, time 297.68, cls_loss 4.1508 306 cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0067, device='cuda:0', grad_fn=) cls_loss: tensor(4.1693, device='cuda:0', grad_fn=) cls_loss: tensor(3.8229, device='cuda:0', grad_fn=) cls_loss: tensor(3.8325, device='cuda:0', grad_fn=) cls_loss: tensor(2.8107, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7683e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.1460e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.2112, device='cuda:0', grad_fn=) cls_loss: tensor(1.9980, device='cuda:0', grad_fn=) cls_loss: tensor(11.9948, device='cuda:0', grad_fn=) cls_loss: tensor(0.0522, device='cuda:0', grad_fn=) cls_loss: tensor(0.1828, device='cuda:0', grad_fn=) cls_loss: tensor(3.4740, device='cuda:0', grad_fn=) cls_loss: tensor(3.8362, device='cuda:0', grad_fn=) cls_loss: tensor(3.4723, device='cuda:0', grad_fn=) cls_loss: tensor(5.5868, device='cuda:0', grad_fn=) cls_loss: tensor(1.6771, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.3844e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(12.8879, device='cuda:0', grad_fn=) cls_loss: tensor(5.0903, device='cuda:0', grad_fn=) cls_loss: tensor(0.3364, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.8294, device='cuda:0', grad_fn=) cls_loss: tensor(6.4688, device='cuda:0', grad_fn=) cls_loss: tensor(8.7083, device='cuda:0', grad_fn=) cls_loss: tensor(4.5808, device='cuda:0', grad_fn=) cls_loss: tensor(34.0391, device='cuda:0', grad_fn=) cls_loss: tensor(6.7422, device='cuda:0', grad_fn=) cls_loss: tensor(0.1842, device='cuda:0', grad_fn=) cls_loss: tensor(10.9610, device='cuda:0', grad_fn=) cls_loss: tensor(4.7292, device='cuda:0', grad_fn=) cls_loss: tensor(1.3776, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.6890e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.6494, device='cuda:0', grad_fn=) cls_loss: tensor(0.0103, device='cuda:0', grad_fn=) cls_loss: tensor(6.3022, device='cuda:0', grad_fn=) cls_loss: tensor(1.3031, device='cuda:0', grad_fn=) cls_loss: tensor(2.0464e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(11.3542, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.3346, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.1890e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.8574, device='cuda:0', grad_fn=) cls_loss: tensor(9.3177, device='cuda:0', grad_fn=) cls_loss: tensor(1.0317, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3857, device='cuda:0', grad_fn=) cls_loss: tensor(4.5911, device='cuda:0', grad_fn=) cls_loss: tensor(0.3242, device='cuda:0', grad_fn=) cls_loss: tensor(5.0104, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0137, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.1771, device='cuda:0', grad_fn=) cls_loss: tensor(1.3208, device='cuda:0', grad_fn=) cls_loss: tensor(0.8138, device='cuda:0', grad_fn=) cls_loss: tensor(13.4072, device='cuda:0', grad_fn=) cls_loss: tensor(8.4336, device='cuda:0', grad_fn=) cls_loss: tensor(16.3691, device='cuda:0', grad_fn=) cls_loss: tensor(0.7267, device='cuda:0', grad_fn=) cls_loss: tensor(1.6085, device='cuda:0', grad_fn=) cls_loss: tensor(5.3594, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.5280, device='cuda:0', grad_fn=) cls_loss: tensor(3.1380, device='cuda:0', grad_fn=) cls_loss: tensor(0.4736, device='cuda:0', grad_fn=) cls_loss: tensor(20.6589, device='cuda:0', grad_fn=) cls_loss: tensor(12.6606, device='cuda:0', grad_fn=) cls_loss: tensor(0.0086, device='cuda:0', grad_fn=) cls_loss: tensor(4.7982e-05, device='cuda:0', grad_fn=) cls_loss: tensor(26.3333, device='cuda:0', grad_fn=) cls_loss: tensor(5.9479, device='cuda:0', grad_fn=) cls_loss: tensor(0.4416, device='cuda:0', grad_fn=) cls_loss: tensor(2.4948, device='cuda:0', grad_fn=) cls_loss: tensor(7.0990, device='cuda:0', grad_fn=) cls_loss: tensor(4.5863, device='cuda:0', grad_fn=) cls_loss: tensor(4.2135, device='cuda:0', grad_fn=) cls_loss: tensor(0.5566, device='cuda:0', grad_fn=) cls_loss: tensor(0.4082, device='cuda:0', grad_fn=) cls_loss: tensor(2.3400, device='cuda:0', grad_fn=) cls_loss: tensor(5.2500, device='cuda:0', grad_fn=) cls_loss: tensor(1.2448, device='cuda:0', grad_fn=) cls_loss: tensor(10.7673, device='cuda:0', grad_fn=) cls_loss: tensor(1.9740, device='cuda:0', grad_fn=) cls_loss: tensor(2.6592, device='cuda:0', grad_fn=) cls_loss: tensor(4.4027, device='cuda:0', grad_fn=) cls_loss: tensor(1.6525, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.0039, device='cuda:0', grad_fn=) cls_loss: tensor(9.8229, device='cuda:0', grad_fn=) cls_loss: tensor(5.1328, device='cuda:0', grad_fn=) cls_loss: tensor(0.3636, device='cuda:0', grad_fn=) cls_loss: tensor(1.8925, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(1.0331e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.1667, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.8229, device='cuda:0', grad_fn=) cls_loss: tensor(4.4518, device='cuda:0', grad_fn=) cls_loss: tensor(7.1953, device='cuda:0', grad_fn=) cls_loss: tensor(3.0353, device='cuda:0', grad_fn=) cls_loss: tensor(1.0905, device='cuda:0', grad_fn=) cls_loss: tensor(1.9471e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.9447, device='cuda:0', grad_fn=) cls_loss: tensor(7.8487, device='cuda:0', grad_fn=) cls_loss: tensor(4.8158, device='cuda:0', grad_fn=) cls_loss: tensor(4.5131, device='cuda:0', grad_fn=) cls_loss: tensor(0.1768, device='cuda:0', grad_fn=) cls_loss: tensor(1.7682, device='cuda:0', grad_fn=) cls_loss: tensor(6.3803, device='cuda:0', grad_fn=) cls_loss: tensor(5.0769, device='cuda:0', grad_fn=) cls_loss: tensor(8.9264, device='cuda:0', grad_fn=) cls_loss: tensor(0.0185, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.7083, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.4036, device='cuda:0', grad_fn=) cls_loss: tensor(1.0730, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.0077, device='cuda:0', grad_fn=) cls_loss: tensor(0.2323, device='cuda:0', grad_fn=) cls_loss: tensor(0.3228, device='cuda:0', grad_fn=) cls_loss: tensor(8.4922, device='cuda:0', grad_fn=) cls_loss: tensor(2.1169, device='cuda:0', grad_fn=) cls_loss: tensor(5.4453, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.2487, device='cuda:0', grad_fn=) cls_loss: tensor(1.7625, device='cuda:0', grad_fn=) cls_loss: tensor(6.6875, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.0928e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.5446, device='cuda:0', grad_fn=) cls_loss: tensor(2.2302, device='cuda:0', grad_fn=) cls_loss: tensor(2.3151, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.6820, device='cuda:0', grad_fn=) cls_loss: tensor(0.0218, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0184, device='cuda:0', grad_fn=) cls_loss: tensor(3.9811, device='cuda:0', grad_fn=) cls_loss: tensor(3.1172, device='cuda:0', grad_fn=) cls_loss: tensor(3.5942e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.5488, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(9.0729, device='cuda:0', grad_fn=) cls_loss: tensor(7.6146, device='cuda:0', grad_fn=) cls_loss: tensor(0.0058, device='cuda:0', grad_fn=) cls_loss: tensor(13.5260, device='cuda:0', grad_fn=) cls_loss: tensor(2.4427, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.2113, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(14.6703, device='cuda:0', grad_fn=) cls_loss: tensor(0.1812, device='cuda:0', grad_fn=) cls_loss: tensor(4.8876e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.3853, device='cuda:0', grad_fn=) cls_loss: tensor(3.1927, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.7645e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9870e-05, device='cuda:0', grad_fn=) cls_loss: tensor(10.7396, device='cuda:0', grad_fn=) cls_loss: tensor(2.3047, device='cuda:0', grad_fn=) cls_loss: tensor(3.6862, device='cuda:0', grad_fn=) cls_loss: tensor(1.2155, device='cuda:0', grad_fn=) cls_loss: tensor(0.6423, device='cuda:0', grad_fn=) cls_loss: tensor(4.5300e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3411e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.3099, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.7783, device='cuda:0', grad_fn=) cls_loss: tensor(0.1528, device='cuda:0', grad_fn=) cls_loss: tensor(0.0645, device='cuda:0', grad_fn=) cls_loss: tensor(9.9225, device='cuda:0', grad_fn=) cls_loss: tensor(6.9618, device='cuda:0', grad_fn=) cls_loss: tensor(3.1231, device='cuda:0', grad_fn=) cls_loss: tensor(10.4076, device='cuda:0', grad_fn=) cls_loss: tensor(6.2630, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.5666, device='cuda:0', grad_fn=) cls_loss: tensor(0.7255, device='cuda:0', grad_fn=) cls_loss: tensor(0.1742, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0398, device='cuda:0', grad_fn=) cls_loss: tensor(10.1784, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(5.3021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0203, device='cuda:0', grad_fn=) cls_loss: tensor(5.1636, device='cuda:0', grad_fn=) cls_loss: tensor(1.5573, device='cuda:0', grad_fn=) cls_loss: tensor(1.2363, device='cuda:0', grad_fn=) cls_loss: tensor(4.3685, device='cuda:0', grad_fn=) cls_loss: tensor(0.0044, device='cuda:0', grad_fn=) cls_loss: tensor(1.5538, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.8861, device='cuda:0', grad_fn=) cls_loss: tensor(0.3078, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5080, device='cuda:0', grad_fn=) cls_loss: tensor(2.6549, device='cuda:0', grad_fn=) cls_loss: tensor(0.0544, device='cuda:0', grad_fn=) cls_loss: tensor(4.0886, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.2090, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(3.6495, device='cuda:0', grad_fn=) cls_loss: tensor(2.3750, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0245, device='cuda:0', grad_fn=) cls_loss: tensor(2.3190, device='cuda:0', grad_fn=) cls_loss: tensor(5.9525e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.2734, device='cuda:0', grad_fn=) cls_loss: tensor(6.1110, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.4635, device='cuda:0', grad_fn=) cls_loss: tensor(5.8214e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.6797, device='cuda:0', grad_fn=) cls_loss: tensor(6.9069, device='cuda:0', grad_fn=) cls_loss: tensor(5.3655, device='cuda:0', grad_fn=) cls_loss: tensor(1.2896, device='cuda:0', grad_fn=) cls_loss: tensor(4.0730e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.1865, device='cuda:0', grad_fn=) cls_loss: tensor(8.9327e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.0104, device='cuda:0', grad_fn=) cls_loss: tensor(3.8854, device='cuda:0', grad_fn=) cls_loss: tensor(0.9095, device='cuda:0', grad_fn=) cls_loss: tensor(0.7415, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.4834, device='cuda:0', grad_fn=) cls_loss: tensor(5.2951, device='cuda:0', grad_fn=) cls_loss: tensor(7.7161, device='cuda:0', grad_fn=) cls_loss: tensor(15.3555, device='cuda:0', grad_fn=) cls_loss: tensor(0.7989, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.7957, device='cuda:0', grad_fn=) cls_loss: tensor(1.7484e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5846, device='cuda:0', grad_fn=) cls_loss: tensor(4.0052, device='cuda:0', grad_fn=) cls_loss: tensor(3.1228, device='cuda:0', grad_fn=) cls_loss: tensor(0.8190, device='cuda:0', grad_fn=) cls_loss: tensor(0.1156, device='cuda:0', grad_fn=) cls_loss: tensor(0.9918, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.3101, device='cuda:0', grad_fn=) cls_loss: tensor(6.4896, device='cuda:0', grad_fn=) cls_loss: tensor(1.8350, device='cuda:0', grad_fn=) cls_loss: tensor(14.7599, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9332e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5966, device='cuda:0', grad_fn=) cls_loss: tensor(11.2946, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.6989, device='cuda:0', grad_fn=) cls_loss: tensor(3.9839, device='cuda:0', grad_fn=) cls_loss: tensor(0.4665, device='cuda:0', grad_fn=) cls_loss: tensor(0.2075, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.1432, device='cuda:0', grad_fn=) cls_loss: tensor(3.9297, device='cuda:0', grad_fn=) cls_loss: tensor(4.7458, device='cuda:0', grad_fn=) cls_loss: tensor(0.8294, device='cuda:0', grad_fn=) cls_loss: tensor(9.8620, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(1.2867, device='cuda:0', grad_fn=) cls_loss: tensor(2.3126, device='cuda:0', grad_fn=) cls_loss: tensor(18.4948, device='cuda:0', grad_fn=) cls_loss: tensor(0.0340, device='cuda:0', grad_fn=) cls_loss: tensor(7.7201, device='cuda:0', grad_fn=) cls_loss: tensor(2.0184, device='cuda:0', grad_fn=) cls_loss: tensor(1.1125, device='cuda:0', grad_fn=) cls_loss: tensor(0.0271, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.6162, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(2.4688, device='cuda:0', grad_fn=) cls_loss: tensor(1.4479, device='cuda:0', grad_fn=) cls_loss: tensor(10.1198, device='cuda:0', grad_fn=) cls_loss: tensor(3.2057, device='cuda:0', grad_fn=) cls_loss: tensor(3.3385, device='cuda:0', grad_fn=) cls_loss: tensor(3.3698, device='cuda:0', grad_fn=) cls_loss: tensor(3.4635, device='cuda:0', grad_fn=) cls_loss: tensor(1.8181, device='cuda:0', grad_fn=) 9.890738003669029e-05 changing lr epoch 2, time 302.80, cls_loss 2.9470 306 cls_loss: tensor(0.4344, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0566, device='cuda:0', grad_fn=) cls_loss: tensor(2.6693, device='cuda:0', grad_fn=) cls_loss: tensor(3.4173e-06, device='cuda:0', grad_fn=) cls_loss: tensor(30.5417, device='cuda:0', grad_fn=) cls_loss: tensor(0.2383, device='cuda:0', grad_fn=) cls_loss: tensor(0.1673, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.6298, device='cuda:0', grad_fn=) cls_loss: tensor(3.8743e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.0067e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.6458, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5156, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.6032, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0872, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.0246e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.2467e-05, device='cuda:0', grad_fn=) cls_loss: tensor(12.1016, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.6296, device='cuda:0', grad_fn=) cls_loss: tensor(2.2018, device='cuda:0', grad_fn=) cls_loss: tensor(2.5586, device='cuda:0', grad_fn=) cls_loss: tensor(5.2849e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7904, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.4779, device='cuda:0', grad_fn=) cls_loss: tensor(4.1922e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.2755, device='cuda:0', grad_fn=) cls_loss: tensor(11.0521, device='cuda:0', grad_fn=) cls_loss: tensor(0.0573, device='cuda:0', grad_fn=) cls_loss: tensor(0.3014, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0990, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.9080, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.8333, device='cuda:0', grad_fn=) cls_loss: tensor(0.0070, device='cuda:0', grad_fn=) cls_loss: tensor(0.8652, device='cuda:0', grad_fn=) cls_loss: tensor(1.3932, device='cuda:0', grad_fn=) cls_loss: tensor(4.4622, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.7708, device='cuda:0', grad_fn=) cls_loss: tensor(1.1615, device='cuda:0', grad_fn=) cls_loss: tensor(2.0286, device='cuda:0', grad_fn=) cls_loss: tensor(2.1146, device='cuda:0', grad_fn=) cls_loss: tensor(0.1378, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1105, device='cuda:0', grad_fn=) cls_loss: tensor(5.6783e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.8147e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2826, device='cuda:0', grad_fn=) cls_loss: tensor(0.0177, device='cuda:0', grad_fn=) cls_loss: tensor(20.8870, device='cuda:0', grad_fn=) cls_loss: tensor(8.5605, device='cuda:0', grad_fn=) cls_loss: tensor(4.3427, device='cuda:0', grad_fn=) cls_loss: tensor(0.3363, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.2195, device='cuda:0', grad_fn=) cls_loss: tensor(0.0223, device='cuda:0', grad_fn=) cls_loss: tensor(0.8887, device='cuda:0', grad_fn=) cls_loss: tensor(2.5573, device='cuda:0', grad_fn=) cls_loss: tensor(4.2682, device='cuda:0', grad_fn=) cls_loss: tensor(0.1853, device='cuda:0', grad_fn=) cls_loss: tensor(1.9909, device='cuda:0', grad_fn=) cls_loss: tensor(2.2174, device='cuda:0', grad_fn=) cls_loss: tensor(0.5957, device='cuda:0', grad_fn=) cls_loss: tensor(6.0755, device='cuda:0', grad_fn=) cls_loss: tensor(0.0288, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7474, device='cuda:0', grad_fn=) cls_loss: tensor(0.2075, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.8080e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5000, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.4740, device='cuda:0', grad_fn=) cls_loss: tensor(3.1094, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.7774, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9670e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.4219, device='cuda:0', grad_fn=) cls_loss: tensor(2.5590e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.2813, device='cuda:0', grad_fn=) cls_loss: tensor(9.8149e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9604e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.7161, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.4395, device='cuda:0', grad_fn=) cls_loss: tensor(5.3283, device='cuda:0', grad_fn=) cls_loss: tensor(4.6641, device='cuda:0', grad_fn=) cls_loss: tensor(0.8389, device='cuda:0', grad_fn=) cls_loss: tensor(3.7041, device='cuda:0', grad_fn=) cls_loss: tensor(0.9544, device='cuda:0', grad_fn=) cls_loss: tensor(2.4631, device='cuda:0', grad_fn=) cls_loss: tensor(0.0098, device='cuda:0', grad_fn=) cls_loss: tensor(1.4792, device='cuda:0', grad_fn=) cls_loss: tensor(7.8255, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.8952, device='cuda:0', grad_fn=) cls_loss: tensor(1.2812, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.2878, device='cuda:0', grad_fn=) cls_loss: tensor(0.8098, device='cuda:0', grad_fn=) cls_loss: tensor(17.1893, device='cuda:0', grad_fn=) cls_loss: tensor(5.1589, device='cuda:0', grad_fn=) cls_loss: tensor(8.4775, device='cuda:0', grad_fn=) cls_loss: tensor(2.1455, device='cuda:0', grad_fn=) cls_loss: tensor(0.2386, device='cuda:0', grad_fn=) cls_loss: tensor(0.1808, device='cuda:0', grad_fn=) cls_loss: tensor(0.7026, device='cuda:0', grad_fn=) cls_loss: tensor(3.3893, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0540, device='cuda:0', grad_fn=) cls_loss: tensor(0.6215, device='cuda:0', grad_fn=) cls_loss: tensor(2.5630e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.8099, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.0067e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.5499e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.1836, device='cuda:0', grad_fn=) cls_loss: tensor(5.5469, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.2731, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.4217, device='cuda:0', grad_fn=) cls_loss: tensor(4.1888, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.3405, device='cuda:0', grad_fn=) cls_loss: tensor(4.0934, device='cuda:0', grad_fn=) cls_loss: tensor(0.1305, device='cuda:0', grad_fn=) cls_loss: tensor(0.7624, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1276, device='cuda:0', grad_fn=) cls_loss: tensor(0.0181, device='cuda:0', grad_fn=) cls_loss: tensor(5.3889, device='cuda:0', grad_fn=) cls_loss: tensor(1.7318, device='cuda:0', grad_fn=) cls_loss: tensor(1.2389, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(2.5042, device='cuda:0', grad_fn=) cls_loss: tensor(0.0062, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0464, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.2716e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0070, device='cuda:0', grad_fn=) cls_loss: tensor(0.0228, device='cuda:0', grad_fn=) cls_loss: tensor(11.1250, device='cuda:0', grad_fn=) cls_loss: tensor(2.0729, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.0729, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.4962, device='cuda:0', grad_fn=) cls_loss: tensor(3.5417, device='cuda:0', grad_fn=) cls_loss: tensor(0.0932, device='cuda:0', grad_fn=) cls_loss: tensor(2.7041e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1252, device='cuda:0', grad_fn=) cls_loss: tensor(1.2318e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7318, device='cuda:0', grad_fn=) cls_loss: tensor(1.5286, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5782, device='cuda:0', grad_fn=) cls_loss: tensor(2.7751, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.4896, device='cuda:0', grad_fn=) cls_loss: tensor(1.3705, device='cuda:0', grad_fn=) cls_loss: tensor(1.1556, device='cuda:0', grad_fn=) cls_loss: tensor(0.1481, device='cuda:0', grad_fn=) cls_loss: tensor(14.8685, device='cuda:0', grad_fn=) cls_loss: tensor(5.3646, device='cuda:0', grad_fn=) cls_loss: tensor(0.0168, device='cuda:0', grad_fn=) cls_loss: tensor(5.0690, device='cuda:0', grad_fn=) cls_loss: tensor(0.8001, device='cuda:0', grad_fn=) cls_loss: tensor(1.8203, device='cuda:0', grad_fn=) cls_loss: tensor(1.3541, device='cuda:0', grad_fn=) cls_loss: tensor(5.4141, device='cuda:0', grad_fn=) cls_loss: tensor(9.6257, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0528, device='cuda:0', grad_fn=) cls_loss: tensor(1.7311, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(3.8868, device='cuda:0', grad_fn=) cls_loss: tensor(5.3086, device='cuda:0', grad_fn=) cls_loss: tensor(1.5924, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(2.1797, device='cuda:0', grad_fn=) cls_loss: tensor(6.1354, device='cuda:0', grad_fn=) cls_loss: tensor(1.2563, device='cuda:0', grad_fn=) cls_loss: tensor(2.8705, device='cuda:0', grad_fn=) cls_loss: tensor(1.0391, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7219e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.9539e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.8411, device='cuda:0', grad_fn=) cls_loss: tensor(0.5996, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.2018, device='cuda:0', grad_fn=) cls_loss: tensor(3.3262, device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.4808, device='cuda:0', grad_fn=) cls_loss: tensor(1.1094, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0147, device='cuda:0', grad_fn=) cls_loss: tensor(3.9414, device='cuda:0', grad_fn=) cls_loss: tensor(1.0967e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.6725, device='cuda:0', grad_fn=) cls_loss: tensor(2.4974e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.5706, device='cuda:0', grad_fn=) cls_loss: tensor(2.1042, device='cuda:0', grad_fn=) cls_loss: tensor(4.8818, device='cuda:0', grad_fn=) cls_loss: tensor(1.9389, device='cuda:0', grad_fn=) cls_loss: tensor(4.8490, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(3.9922, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.6047e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1094, device='cuda:0', grad_fn=) cls_loss: tensor(2.9699, device='cuda:0', grad_fn=) cls_loss: tensor(1.3699, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5329, device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.1784, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0331e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2020e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(3.6955e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.8461, device='cuda:0', grad_fn=) cls_loss: tensor(2.3372, device='cuda:0', grad_fn=) cls_loss: tensor(0.7110, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(16.4661, device='cuda:0', grad_fn=) cls_loss: tensor(2.7605, device='cuda:0', grad_fn=) cls_loss: tensor(4.4776, device='cuda:0', grad_fn=) cls_loss: tensor(3.6616, device='cuda:0', grad_fn=) cls_loss: tensor(0.7389, device='cuda:0', grad_fn=) cls_loss: tensor(1.4212, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.3776, device='cuda:0', grad_fn=) cls_loss: tensor(1.4417, device='cuda:0', grad_fn=) cls_loss: tensor(0.2211, device='cuda:0', grad_fn=) cls_loss: tensor(1.9609, device='cuda:0', grad_fn=) cls_loss: tensor(8.4805, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.4896, device='cuda:0', grad_fn=) cls_loss: tensor(0.5107, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.0033, device='cuda:0', grad_fn=) cls_loss: tensor(9.3268, device='cuda:0', grad_fn=) cls_loss: tensor(0.0192, device='cuda:0', grad_fn=) cls_loss: tensor(0.0679, device='cuda:0', grad_fn=) cls_loss: tensor(1.5185, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.0963, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(10.6146, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.8655, device='cuda:0', grad_fn=) cls_loss: tensor(1.1719, device='cuda:0', grad_fn=) cls_loss: tensor(0.9593, device='cuda:0', grad_fn=) cls_loss: tensor(7.1347, device='cuda:0', grad_fn=) cls_loss: tensor(6.8880, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.5677e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0063, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9479, device='cuda:0', grad_fn=) cls_loss: tensor(5.8333, device='cuda:0', grad_fn=) cls_loss: tensor(0.4609, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) 9.755282581475769e-05 changing lr ---------------------saving model at epoch 3---------------------------------------------------- epoch 3, time 300.19, cls_loss 1.8753 306 cls_loss: tensor(6.3925, device='cuda:0', grad_fn=) cls_loss: tensor(5.2129, device='cuda:0', grad_fn=) cls_loss: tensor(0.1018, device='cuda:0', grad_fn=) cls_loss: tensor(0.9209, device='cuda:0', grad_fn=) cls_loss: tensor(0.3613, device='cuda:0', grad_fn=) cls_loss: tensor(5.7022e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.4795, device='cuda:0', grad_fn=) cls_loss: tensor(1.1126e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.1354, device='cuda:0', grad_fn=) cls_loss: tensor(0.0072, device='cuda:0', grad_fn=) cls_loss: tensor(4.1849, device='cuda:0', grad_fn=) cls_loss: tensor(1.6146, device='cuda:0', grad_fn=) cls_loss: tensor(1.2368, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.3117, device='cuda:0', grad_fn=) cls_loss: tensor(0.1825, device='cuda:0', grad_fn=) cls_loss: tensor(0.0060, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.6226e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5827, device='cuda:0', grad_fn=) cls_loss: tensor(0.2588, device='cuda:0', grad_fn=) cls_loss: tensor(1.4505, device='cuda:0', grad_fn=) cls_loss: tensor(5.6146, device='cuda:0', grad_fn=) cls_loss: tensor(3.7135, device='cuda:0', grad_fn=) cls_loss: tensor(5.3099, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(4.3293, device='cuda:0', grad_fn=) cls_loss: tensor(8.6751, device='cuda:0', grad_fn=) cls_loss: tensor(5.4544, device='cuda:0', grad_fn=) cls_loss: tensor(2.3893, device='cuda:0', grad_fn=) cls_loss: tensor(2.8267, device='cuda:0', grad_fn=) cls_loss: tensor(2.1471, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0807, device='cuda:0', grad_fn=) cls_loss: tensor(0.6496, device='cuda:0', grad_fn=) cls_loss: tensor(2.3880, device='cuda:0', grad_fn=) cls_loss: tensor(1.9004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.1929, device='cuda:0', grad_fn=) cls_loss: tensor(4.2318, device='cuda:0', grad_fn=) cls_loss: tensor(6.3298, device='cuda:0', grad_fn=) cls_loss: tensor(2.4023, device='cuda:0', grad_fn=) cls_loss: tensor(3.0729, device='cuda:0', grad_fn=) cls_loss: tensor(2.3669, device='cuda:0', grad_fn=) cls_loss: tensor(1.1027e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.6253, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(9.5148, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0596, device='cuda:0', grad_fn=) cls_loss: tensor(0.0376, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.9590, device='cuda:0', grad_fn=) cls_loss: tensor(2.0133, device='cuda:0', grad_fn=) cls_loss: tensor(3.3734, device='cuda:0', grad_fn=) cls_loss: tensor(0.3776, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.0287, device='cuda:0', grad_fn=) cls_loss: tensor(5.4376, device='cuda:0', grad_fn=) cls_loss: tensor(3.1193e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.3631e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.6823, device='cuda:0', grad_fn=) cls_loss: tensor(6.0938, device='cuda:0', grad_fn=) cls_loss: tensor(7.3945, device='cuda:0', grad_fn=) cls_loss: tensor(7.2122e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.1406, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.9912, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.8959, device='cuda:0', grad_fn=) cls_loss: tensor(0.6417, device='cuda:0', grad_fn=) cls_loss: tensor(1.2318e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.9746, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.2917, device='cuda:0', grad_fn=) cls_loss: tensor(3.1693, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.0169, device='cuda:0', grad_fn=) cls_loss: tensor(9.1003, device='cuda:0', grad_fn=) cls_loss: tensor(0.9225, device='cuda:0', grad_fn=) cls_loss: tensor(4.6338, device='cuda:0', grad_fn=) cls_loss: tensor(2.1823, device='cuda:0', grad_fn=) cls_loss: tensor(1.4102, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.4108, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.2878, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.9882, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.0833, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.4115, device='cuda:0', grad_fn=) cls_loss: tensor(1.1719, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0564, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.3438, device='cuda:0', grad_fn=) cls_loss: tensor(2.3687, device='cuda:0', grad_fn=) cls_loss: tensor(6.8179, device='cuda:0', grad_fn=) cls_loss: tensor(2.1049, device='cuda:0', grad_fn=) cls_loss: tensor(6.5625, device='cuda:0', grad_fn=) cls_loss: tensor(1.6354, device='cuda:0', grad_fn=) cls_loss: tensor(4.2604, device='cuda:0', grad_fn=) cls_loss: tensor(5.7031, device='cuda:0', grad_fn=) cls_loss: tensor(1.0462, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.1250, device='cuda:0', grad_fn=) cls_loss: tensor(5.3326e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9175, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.3594, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.4220, device='cuda:0', grad_fn=) cls_loss: tensor(0.0050, device='cuda:0', grad_fn=) cls_loss: tensor(1.5417, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.5964, device='cuda:0', grad_fn=) cls_loss: tensor(7.8499, device='cuda:0', grad_fn=) cls_loss: tensor(1.3464, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5365, device='cuda:0', grad_fn=) cls_loss: tensor(6.3307, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1683, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.2992, device='cuda:0', grad_fn=) cls_loss: tensor(15.9395, device='cuda:0', grad_fn=) cls_loss: tensor(2.0990, device='cuda:0', grad_fn=) cls_loss: tensor(0.4590, device='cuda:0', grad_fn=) cls_loss: tensor(1.7422, device='cuda:0', grad_fn=) cls_loss: tensor(1.4818, device='cuda:0', grad_fn=) cls_loss: tensor(0.0507, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.4571e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(11.0501, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.1354, device='cuda:0', grad_fn=) cls_loss: tensor(0.0162, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.9157, device='cuda:0', grad_fn=) cls_loss: tensor(1.6789e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0182, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.0929e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.4505e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.4323, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.3075, device='cuda:0', grad_fn=) cls_loss: tensor(1.1151, device='cuda:0', grad_fn=) cls_loss: tensor(0.0216, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.4395, device='cuda:0', grad_fn=) cls_loss: tensor(9.5225, device='cuda:0', grad_fn=) cls_loss: tensor(0.0064, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.8835, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.4458, device='cuda:0', grad_fn=) cls_loss: tensor(2.1628, device='cuda:0', grad_fn=) cls_loss: tensor(5.1172, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.3838, device='cuda:0', grad_fn=) cls_loss: tensor(4.8779, device='cuda:0', grad_fn=) cls_loss: tensor(7.5000, device='cuda:0', grad_fn=) cls_loss: tensor(0.3008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0148, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.7813, device='cuda:0', grad_fn=) cls_loss: tensor(0.3268, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.7676, device='cuda:0', grad_fn=) cls_loss: tensor(0.0114, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.7889e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4034, device='cuda:0', grad_fn=) cls_loss: tensor(2.3226, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.4479, device='cuda:0', grad_fn=) cls_loss: tensor(3.0955e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5938, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.4451e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(3.7174, device='cuda:0', grad_fn=) cls_loss: tensor(2.8724, device='cuda:0', grad_fn=) cls_loss: tensor(1.7288, device='cuda:0', grad_fn=) cls_loss: tensor(0.2709, device='cuda:0', grad_fn=) cls_loss: tensor(6.3164, device='cuda:0', grad_fn=) cls_loss: tensor(0.8730, device='cuda:0', grad_fn=) cls_loss: tensor(1.4062, device='cuda:0', grad_fn=) cls_loss: tensor(9.4573e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3568, device='cuda:0', grad_fn=) cls_loss: tensor(5.0677, device='cuda:0', grad_fn=) cls_loss: tensor(2.3372, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.8736, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0132, device='cuda:0', grad_fn=) cls_loss: tensor(0.5799, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.1135, device='cuda:0', grad_fn=) cls_loss: tensor(0.0715, device='cuda:0', grad_fn=) cls_loss: tensor(0.3052, device='cuda:0', grad_fn=) cls_loss: tensor(1.2614, device='cuda:0', grad_fn=) cls_loss: tensor(3.2786, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9792, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.7734, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1224, device='cuda:0', grad_fn=) cls_loss: tensor(0.1706, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.5885, device='cuda:0', grad_fn=) cls_loss: tensor(5.2891, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.8776, device='cuda:0', grad_fn=) cls_loss: tensor(0.4561, device='cuda:0', grad_fn=) cls_loss: tensor(0.5889, device='cuda:0', grad_fn=) cls_loss: tensor(1.1654, device='cuda:0', grad_fn=) cls_loss: tensor(0.0079, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(6.5312, device='cuda:0', grad_fn=) cls_loss: tensor(3.4062, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.7285e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0402, device='cuda:0', grad_fn=) cls_loss: tensor(0.9225, device='cuda:0', grad_fn=) cls_loss: tensor(5.3203, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.1549, device='cuda:0', grad_fn=) cls_loss: tensor(4.6094, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.2139e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0279, device='cuda:0', grad_fn=) cls_loss: tensor(0.0697, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.3061e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1126e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.8977, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.6292e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.9974, device='cuda:0', grad_fn=) cls_loss: tensor(6.0625, device='cuda:0', grad_fn=) cls_loss: tensor(7.8490, device='cuda:0', grad_fn=) cls_loss: tensor(3.5311, device='cuda:0', grad_fn=) cls_loss: tensor(6.5386e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4305e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7318, device='cuda:0', grad_fn=) cls_loss: tensor(1.0912, device='cuda:0', grad_fn=) cls_loss: tensor(5.6742, device='cuda:0', grad_fn=) cls_loss: tensor(0.0150, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.9303, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.8169e-05, device='cuda:0', grad_fn=) 9.567727288213003e-05 changing lr ---------------------saving model at epoch 4---------------------------------------------------- epoch 4, time 302.37, cls_loss 1.6264 306 cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.4857, device='cuda:0', grad_fn=) cls_loss: tensor(2.1895e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.5498, device='cuda:0', grad_fn=) cls_loss: tensor(15.3538, device='cuda:0', grad_fn=) cls_loss: tensor(7.3512e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.6250, device='cuda:0', grad_fn=) cls_loss: tensor(4.0758, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3510e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(6.7188, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(13.2866, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1895, device='cuda:0', grad_fn=) cls_loss: tensor(3.8743e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0928e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.8255, device='cuda:0', grad_fn=) cls_loss: tensor(3.7397, device='cuda:0', grad_fn=) cls_loss: tensor(0.0977, device='cuda:0', grad_fn=) cls_loss: tensor(0.1407, device='cuda:0', grad_fn=) cls_loss: tensor(2.5104, device='cuda:0', grad_fn=) cls_loss: tensor(0.1497, device='cuda:0', grad_fn=) cls_loss: tensor(0.8600, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.6133e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.2113, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.1790, device='cuda:0', grad_fn=) cls_loss: tensor(0.0221, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(11.7006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0312, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.4710, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1466, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.2784e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9870, device='cuda:0', grad_fn=) cls_loss: tensor(1.9300, device='cuda:0', grad_fn=) cls_loss: tensor(0.5986, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.0001e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0124, device='cuda:0', grad_fn=) cls_loss: tensor(5.0363, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(1.7799, device='cuda:0', grad_fn=) cls_loss: tensor(1.1285e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.6797, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.1544, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.6922, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.6888, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.8750, device='cuda:0', grad_fn=) cls_loss: tensor(4.8478e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0790, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.0911, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.2656, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0198, device='cuda:0', grad_fn=) cls_loss: tensor(0.0106, device='cuda:0', grad_fn=) cls_loss: tensor(0.0238, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.2772, device='cuda:0', grad_fn=) cls_loss: tensor(4.3511e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1381, device='cuda:0', grad_fn=) cls_loss: tensor(9.6439, device='cuda:0', grad_fn=) cls_loss: tensor(2.9604e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.6591e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.0443, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.0288, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.8105, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(3.1380, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0144, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(7.9876, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.5837, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.6400e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0061, device='cuda:0', grad_fn=) cls_loss: tensor(2.0105, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0079, device='cuda:0', grad_fn=) cls_loss: tensor(0.2075, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(13.1302, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5306, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.3249, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.0859, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.4003e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6332e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0696, device='cuda:0', grad_fn=) cls_loss: tensor(0.7174, device='cuda:0', grad_fn=) cls_loss: tensor(13.4271, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.1198, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.8294, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.2054e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.0108e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9657e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0283, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.8611e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9670e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.4297, device='cuda:0', grad_fn=) cls_loss: tensor(1.7552, device='cuda:0', grad_fn=) cls_loss: tensor(0.7689, device='cuda:0', grad_fn=) cls_loss: tensor(2.7266, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5885, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.8783, device='cuda:0', grad_fn=) cls_loss: tensor(9.1146, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0908, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.9798, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.6680, device='cuda:0', grad_fn=) cls_loss: tensor(5.0677, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.2586e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.7161, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.2083, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3229, device='cuda:0', grad_fn=) cls_loss: tensor(0.0085, device='cuda:0', grad_fn=) cls_loss: tensor(2.2054e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.2760, device='cuda:0', grad_fn=) cls_loss: tensor(3.6515, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0612, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9307, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.2708, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.3698, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.8854, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1953, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0194, device='cuda:0', grad_fn=) cls_loss: tensor(0.0123, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.1953, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.3464, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.6035, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9333, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7812, device='cuda:0', grad_fn=) cls_loss: tensor(0.5934, device='cuda:0', grad_fn=) cls_loss: tensor(4.3333, device='cuda:0', grad_fn=) cls_loss: tensor(0.0212, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(0.0697, device='cuda:0', grad_fn=) cls_loss: tensor(0.5254, device='cuda:0', grad_fn=) cls_loss: tensor(6.3021, device='cuda:0', grad_fn=) cls_loss: tensor(8.9193, device='cuda:0', grad_fn=) cls_loss: tensor(0.0392, device='cuda:0', grad_fn=) cls_loss: tensor(0.1156, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0806, device='cuda:0', grad_fn=) cls_loss: tensor(0.5130, device='cuda:0', grad_fn=) cls_loss: tensor(1.2096, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.5262, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.1878, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(1.0990, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(1.3113e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.8490, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.7995, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.3869e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.0134e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.2762, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.2721, device='cuda:0', grad_fn=) cls_loss: tensor(0.7467, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0136, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.6322, device='cuda:0', grad_fn=) cls_loss: tensor(0.3649, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.2616e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9414, device='cuda:0', grad_fn=) cls_loss: tensor(9.8745e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.2584e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0175, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) 9.330127018922194e-05 changing lr ---------------------saving model at epoch 5---------------------------------------------------- epoch 5, time 321.56, cls_loss 0.9863 306 cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.8346, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.8546e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.4206, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5683e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(9.2785e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.6094, device='cuda:0', grad_fn=) cls_loss: tensor(2.1042, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0053, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.0418e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.0332e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.1930, device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.5560, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.4506e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.3049e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1265e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.5934, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0094, device='cuda:0', grad_fn=) cls_loss: tensor(0.0697, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0508, device='cuda:0', grad_fn=) cls_loss: tensor(1.8486, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.2865, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.8750, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.5685, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.6289, device='cuda:0', grad_fn=) cls_loss: tensor(1.3510e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9939, device='cuda:0', grad_fn=) cls_loss: tensor(1.5444, device='cuda:0', grad_fn=) cls_loss: tensor(7.1914, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.7061, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.4988e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.6995, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1436, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.3965, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(10.6368, device='cuda:0', grad_fn=) cls_loss: tensor(7.6953, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.1927, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0254, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.5732, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.4326, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0376, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0295, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0079, device='cuda:0', grad_fn=) cls_loss: tensor(3.1392e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0350, device='cuda:0', grad_fn=) cls_loss: tensor(0.6012, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7630, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.7813, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.2751, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1930, device='cuda:0', grad_fn=) cls_loss: tensor(2.1656e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.3708, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0053, device='cuda:0', grad_fn=) cls_loss: tensor(2.9974, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.5885, device='cuda:0', grad_fn=) cls_loss: tensor(0.0156, device='cuda:0', grad_fn=) cls_loss: tensor(5.7658e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.1094, device='cuda:0', grad_fn=) cls_loss: tensor(6.4844, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.8776, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.3727, device='cuda:0', grad_fn=) cls_loss: tensor(1.3771, device='cuda:0', grad_fn=) cls_loss: tensor(7.8281e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.5088, device='cuda:0', grad_fn=) cls_loss: tensor(1.8175, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.3568, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.8750, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.1432, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.1195, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0177, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.8529, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3835, device='cuda:0', grad_fn=) cls_loss: tensor(0.0446, device='cuda:0', grad_fn=) cls_loss: tensor(1.2448, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0956, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.6849, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1722, device='cuda:0', grad_fn=) cls_loss: tensor(7.0532e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5812, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.6757e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.2777, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0179, device='cuda:0', grad_fn=) cls_loss: tensor(2.7161, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.6757e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.6668, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.5100e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.2293, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.3385, device='cuda:0', grad_fn=) cls_loss: tensor(2.9792, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0719, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.7433e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.6263, device='cuda:0', grad_fn=) cls_loss: tensor(5.0759, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.0663e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(8.9407e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.8125, device='cuda:0', grad_fn=) cls_loss: tensor(2.9402, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1816, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.2875e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0245, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.2982e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.1624, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1250, device='cuda:0', grad_fn=) cls_loss: tensor(0.0072, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(1.1782e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5156, device='cuda:0', grad_fn=) cls_loss: tensor(0.0851, device='cuda:0', grad_fn=) cls_loss: tensor(0.9453, device='cuda:0', grad_fn=) cls_loss: tensor(1.8458e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6821, device='cuda:0', grad_fn=) cls_loss: tensor(0.0376, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.5111, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9391e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.0889e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.6250, device='cuda:0', grad_fn=) cls_loss: tensor(6.8906, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0193, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1751, device='cuda:0', grad_fn=) cls_loss: tensor(2.6771, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.2124, device='cuda:0', grad_fn=) cls_loss: tensor(0.0429, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5286, device='cuda:0', grad_fn=) cls_loss: tensor(5.2917, device='cuda:0', grad_fn=) cls_loss: tensor(0.2715, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(1.9062, device='cuda:0', grad_fn=) cls_loss: tensor(3.4974, device='cuda:0', grad_fn=) cls_loss: tensor(1.2689, device='cuda:0', grad_fn=) cls_loss: tensor(0.0037, device='cuda:0', grad_fn=) cls_loss: tensor(1.7148, device='cuda:0', grad_fn=) cls_loss: tensor(9.1195e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.6875, device='cuda:0', grad_fn=) cls_loss: tensor(1.8279e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) 9.045084971874738e-05 changing lr epoch 6, time 329.11, cls_loss 0.6325 306 cls_loss: tensor(0.0056, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.3438, device='cuda:0', grad_fn=) cls_loss: tensor(3.6955e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0079, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.8027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0188, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.3682, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(7.9271, device='cuda:0', grad_fn=) cls_loss: tensor(1.6689e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.3932, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.7138, device='cuda:0', grad_fn=) cls_loss: tensor(9.8745e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.8646, device='cuda:0', grad_fn=) cls_loss: tensor(0.0710, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0.9031, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.4656e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.2396, device='cuda:0', grad_fn=) cls_loss: tensor(0.2686, device='cuda:0', grad_fn=) cls_loss: tensor(3.1621, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0766, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0122, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.2785e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.4062, device='cuda:0', grad_fn=) cls_loss: tensor(2.5599, device='cuda:0', grad_fn=) cls_loss: tensor(1.5729, device='cuda:0', grad_fn=) cls_loss: tensor(1.2604, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.9813, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0516, device='cuda:0', grad_fn=) cls_loss: tensor(4.1341, device='cuda:0', grad_fn=) cls_loss: tensor(2.3242, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0223, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.1128e-06, device='cuda:0', grad_fn=) cls_loss: tensor(11.2057, device='cuda:0', grad_fn=) cls_loss: tensor(0.0566, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(1.1881e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.2503, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.5579, device='cuda:0', grad_fn=) cls_loss: tensor(0.1292, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.3307e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0369, device='cuda:0', grad_fn=) cls_loss: tensor(0.5156, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.2783, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.5029, device='cuda:0', grad_fn=) cls_loss: tensor(3.8099, device='cuda:0', grad_fn=) cls_loss: tensor(4.2787, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5356, device='cuda:0', grad_fn=) cls_loss: tensor(1.0692, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3151, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.3232, device='cuda:0', grad_fn=) cls_loss: tensor(0.1799, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.8260, device='cuda:0', grad_fn=) cls_loss: tensor(8.3327e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1849e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0643, device='cuda:0', grad_fn=) cls_loss: tensor(0.3340, device='cuda:0', grad_fn=) cls_loss: tensor(0.0373, device='cuda:0', grad_fn=) cls_loss: tensor(4.5000, device='cuda:0', grad_fn=) cls_loss: tensor(0.4824, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1656e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.6042, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.1784, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.4630, device='cuda:0', grad_fn=) cls_loss: tensor(0.3822, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.4571e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.6184, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.0846, device='cuda:0', grad_fn=) cls_loss: tensor(0.1544, device='cuda:0', grad_fn=) cls_loss: tensor(6.1651e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.8676e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.8656e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.1394e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.6877e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.0797e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3047e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.1313e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0132, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(15.1355, device='cuda:0', grad_fn=) cls_loss: tensor(0.4794, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.1543e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.1289, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0172, device='cuda:0', grad_fn=) cls_loss: tensor(2.4351, device='cuda:0', grad_fn=) cls_loss: tensor(1.5339, device='cuda:0', grad_fn=) cls_loss: tensor(4.0944, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.4042e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.4439e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.1189, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1546, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.2188, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.8478e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0085, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.5098, device='cuda:0', grad_fn=) cls_loss: tensor(0.2281, device='cuda:0', grad_fn=) cls_loss: tensor(2.8984, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.0553, device='cuda:0', grad_fn=) cls_loss: tensor(0.4189, device='cuda:0', grad_fn=) cls_loss: tensor(1.5497e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.5617e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.4703e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.8743e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.2189, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.2005, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.9355, device='cuda:0', grad_fn=) cls_loss: tensor(0.2878, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.3438, device='cuda:0', grad_fn=) cls_loss: tensor(2.7676, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.0293e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.8160e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.0466e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.6374, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.4167, device='cuda:0', grad_fn=) cls_loss: tensor(4.3307, device='cuda:0', grad_fn=) cls_loss: tensor(8.1857e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.8802, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.8745e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.5610, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0308, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(10.7670, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(7.8996e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0283, device='cuda:0', grad_fn=) cls_loss: tensor(9.1195e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0271, device='cuda:0', grad_fn=) cls_loss: tensor(1.0840, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0244, device='cuda:0', grad_fn=) cls_loss: tensor(7.7486e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.9804e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.7023e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0180, device='cuda:0', grad_fn=) cls_loss: tensor(2.0464e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.2318, device='cuda:0', grad_fn=) cls_loss: tensor(5.2552, device='cuda:0', grad_fn=) cls_loss: tensor(0.0874, device='cuda:0', grad_fn=) cls_loss: tensor(8.0466e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0155, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.5260e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.6392, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5423, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.8542, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.2252e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.5050e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.2515, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) 8.715724127386972e-05 changing lr ---------------------saving model at epoch 7---------------------------------------------------- epoch 7, time 328.84, cls_loss 0.6575 306 cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.9804e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.5312, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.0061e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5859, device='cuda:0', grad_fn=) cls_loss: tensor(0.4811, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0071, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0771, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1586, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0107, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.1394e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0177, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1291, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5934, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.1641, device='cuda:0', grad_fn=) cls_loss: tensor(9.3619e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.2180e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3932, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.0875e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.3822, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3464, device='cuda:0', grad_fn=) cls_loss: tensor(2.3984, device='cuda:0', grad_fn=) cls_loss: tensor(7.3512e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.2454, device='cuda:0', grad_fn=) cls_loss: tensor(7.2916e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.8359, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1325e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9670e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0037, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0481, device='cuda:0', grad_fn=) cls_loss: tensor(0.3535, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9505, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.2786, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.5367e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.9844, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.7493, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3646, device='cuda:0', grad_fn=) cls_loss: tensor(0.0652, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(0.1825, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(3.1690e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.5209, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.1771, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.1484, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.6498, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.7624, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(6.5764e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0064, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5678, device='cuda:0', grad_fn=) cls_loss: tensor(1.8503, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.9111, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0051, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9525e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0044, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0964, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.8802, device='cuda:0', grad_fn=) cls_loss: tensor(1.2266, device='cuda:0', grad_fn=) cls_loss: tensor(0.9920, device='cuda:0', grad_fn=) cls_loss: tensor(7.6413e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.5107, device='cuda:0', grad_fn=) cls_loss: tensor(0.1428, device='cuda:0', grad_fn=) cls_loss: tensor(0.1966, device='cuda:0', grad_fn=) cls_loss: tensor(0.0058, device='cuda:0', grad_fn=) cls_loss: tensor(0.2272, device='cuda:0', grad_fn=) cls_loss: tensor(4.5115, device='cuda:0', grad_fn=) cls_loss: tensor(1.4167, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.6276, device='cuda:0', grad_fn=) cls_loss: tensor(3.4323, device='cuda:0', grad_fn=) cls_loss: tensor(0.7415, device='cuda:0', grad_fn=) cls_loss: tensor(0.0344, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1646, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1656e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.5781, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9630e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.8942e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0051, device='cuda:0', grad_fn=) cls_loss: tensor(0.1489, device='cuda:0', grad_fn=) cls_loss: tensor(1.2943, device='cuda:0', grad_fn=) cls_loss: tensor(0.0200, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.1128e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(3.8624e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.2544, device='cuda:0', grad_fn=) cls_loss: tensor(0.0106, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.9762, device='cuda:0', grad_fn=) cls_loss: tensor(4.1250, device='cuda:0', grad_fn=) cls_loss: tensor(0.7799, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.2002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0162, device='cuda:0', grad_fn=) cls_loss: tensor(1.8229, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0547, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.6042, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3139, device='cuda:0', grad_fn=) cls_loss: tensor(0.2835, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.6797, device='cuda:0', grad_fn=) cls_loss: tensor(0.0566, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.2668, device='cuda:0', grad_fn=) cls_loss: tensor(1.0749e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.4676e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.6544, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.6193e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.8229, device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9955e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) 8.345653031794292e-05 changing lr ---------------------saving model at epoch 8---------------------------------------------------- epoch 8, time 330.88, cls_loss 0.3828 306 cls_loss: tensor(0.7207, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.0814, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0606, device='cuda:0', grad_fn=) cls_loss: tensor(0.9850, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.8438e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0184, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3086, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0180, device='cuda:0', grad_fn=) cls_loss: tensor(0.4837, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.6422, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0106, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.6361e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0522, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5326, device='cuda:0', grad_fn=) cls_loss: tensor(1.0267, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.5169, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.7943, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1901e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.6364, device='cuda:0', grad_fn=) cls_loss: tensor(3.3816e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0405, device='cuda:0', grad_fn=) cls_loss: tensor(1.4799, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0110, device='cuda:0', grad_fn=) cls_loss: tensor(0.0124, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9818, device='cuda:0', grad_fn=) cls_loss: tensor(1.0411e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.2981e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.2848e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(3.5729, device='cuda:0', grad_fn=) cls_loss: tensor(0.2863, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.6955e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.9085, device='cuda:0', grad_fn=) cls_loss: tensor(0.9909, device='cuda:0', grad_fn=) cls_loss: tensor(9.7354e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7560, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0191, device='cuda:0', grad_fn=) cls_loss: tensor(8.3712, device='cuda:0', grad_fn=) cls_loss: tensor(9.4970e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0336, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.5459, device='cuda:0', grad_fn=) cls_loss: tensor(0.0053, device='cuda:0', grad_fn=) cls_loss: tensor(3.8945, device='cuda:0', grad_fn=) cls_loss: tensor(0.1281, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.3206, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.3363, device='cuda:0', grad_fn=) cls_loss: tensor(0.6647, device='cuda:0', grad_fn=) cls_loss: tensor(16.8367, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.6549, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.6226e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0940, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.8490, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(7.9400, device='cuda:0', grad_fn=) cls_loss: tensor(1.4531, device='cuda:0', grad_fn=) cls_loss: tensor(0.0728, device='cuda:0', grad_fn=) cls_loss: tensor(0.0247, device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.0573, device='cuda:0', grad_fn=) cls_loss: tensor(0.5280, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7087e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2414, device='cuda:0', grad_fn=) cls_loss: tensor(2.6433, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(1.0482, device='cuda:0', grad_fn=) cls_loss: tensor(0.3708, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5167e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3643e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2279e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0249, device='cuda:0', grad_fn=) cls_loss: tensor(8.4102e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.6433e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.5499e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0833, device='cuda:0', grad_fn=) cls_loss: tensor(1.3659, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.4889, device='cuda:0', grad_fn=) cls_loss: tensor(0.5505, device='cuda:0', grad_fn=) cls_loss: tensor(1.7552, device='cuda:0', grad_fn=) cls_loss: tensor(2.1459, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.6689e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.7786, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(2.2301, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0102, device='cuda:0', grad_fn=) cls_loss: tensor(2.1680, device='cuda:0', grad_fn=) cls_loss: tensor(3.7214, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.5081, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.1984, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0079, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0510e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.2365e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2648, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1924, device='cuda:0', grad_fn=) cls_loss: tensor(3.3626, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.8841, device='cuda:0', grad_fn=) cls_loss: tensor(2.3854, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(2.8880, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.2226, device='cuda:0', grad_fn=) cls_loss: tensor(3.0625, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3893, device='cuda:0', grad_fn=) cls_loss: tensor(1.1722e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5065, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(1.6292e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.8750, device='cuda:0', grad_fn=) cls_loss: tensor(0.3052, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.2916e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.0875e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.1042, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.7057, device='cuda:0', grad_fn=) cls_loss: tensor(1.5299e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.5498e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(1.7126, device='cuda:0', grad_fn=) cls_loss: tensor(2.3683e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.8281, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0551, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0612, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9154, device='cuda:0', grad_fn=) cls_loss: tensor(1.4753, device='cuda:0', grad_fn=) cls_loss: tensor(0.8711, device='cuda:0', grad_fn=) cls_loss: tensor(1.2100e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.2100e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0060, device='cuda:0', grad_fn=) 7.938926261462366e-05 changing lr epoch 9, time 331.22, cls_loss 0.5072 306 cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1486, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.7708, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.4261, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.1557, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(4.1922e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(2.2451e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.6667, device='cuda:0', grad_fn=) cls_loss: tensor(0.3894, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.0195, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(1.5110, device='cuda:0', grad_fn=) cls_loss: tensor(2.4316, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0111, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5582, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9206, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0145, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.1182, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1348, device='cuda:0', grad_fn=) cls_loss: tensor(5.2571e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.2344, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.4146e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.2272e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0669e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.0863e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5199e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1344, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3698, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.6272e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.2546, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0132, device='cuda:0', grad_fn=) cls_loss: tensor(8.7420e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.6836, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.3247e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3332e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7418e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5026, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5590e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.2968e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.0443, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.3561, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.8610e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1198, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7816e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.9948, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.8333, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0723, device='cuda:0', grad_fn=) cls_loss: tensor(2.3167, device='cuda:0', grad_fn=) cls_loss: tensor(2.2054e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0364, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1466, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1458, device='cuda:0', grad_fn=) cls_loss: tensor(0.1025, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.1392e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9009e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0267, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.4311, device='cuda:0', grad_fn=) cls_loss: tensor(0.0960, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.8503, device='cuda:0', grad_fn=) cls_loss: tensor(1.0710, device='cuda:0', grad_fn=) cls_loss: tensor(2.9405e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7683e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.4297, device='cuda:0', grad_fn=) cls_loss: tensor(7.6294e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0604, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.6382, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.3326e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.2345, device='cuda:0', grad_fn=) cls_loss: tensor(0.1012, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(7.8201e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6016, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.3447e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.4154, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1325e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.6623e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5299e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.9948, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.8551e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(1.9694, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3963, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1277, device='cuda:0', grad_fn=) cls_loss: tensor(1.8736e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0248, device='cuda:0', grad_fn=) cls_loss: tensor(0.0128, device='cuda:0', grad_fn=) cls_loss: tensor(0.6751, device='cuda:0', grad_fn=) cls_loss: tensor(2.9604e-06, device='cuda:0', grad_fn=) 7.500000000000001e-05 changing lr ---------------------saving model at epoch 10---------------------------------------------------- epoch 10, time 332.14, cls_loss 0.2086 306 cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5253e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0250, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.2346e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.6888e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.4836e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.8490, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0500, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.2784e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.2109, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5729, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0208, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.2689, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.7476, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5961e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.0930e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.2500, device='cuda:0', grad_fn=) cls_loss: tensor(0.0140, device='cuda:0', grad_fn=) cls_loss: tensor(1.0928e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.5035e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9922e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0484, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.0597e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.5579, device='cuda:0', grad_fn=) cls_loss: tensor(1.7044, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.8934e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0365, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.0067e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3333, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(4.4948, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.2708, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.4082e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.2132, device='cuda:0', grad_fn=) cls_loss: tensor(7.0870e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.0438e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.2113, device='cuda:0', grad_fn=) cls_loss: tensor(7.0938, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.5592, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.6940, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.6598, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(7.4148e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.7665e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0074, device='cuda:0', grad_fn=) cls_loss: tensor(0.0095, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0995, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.4792, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0509, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.3815, device='cuda:0', grad_fn=) cls_loss: tensor(0.1934, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.3672, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1453, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(6.6360e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.3185, device='cuda:0', grad_fn=) cls_loss: tensor(3.0796e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.7533, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.0658, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(3.1392e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0085, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.5630e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9596, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1807, device='cuda:0', grad_fn=) cls_loss: tensor(1.9332e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.7169, device='cuda:0', grad_fn=) cls_loss: tensor(0.0340, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(4.3551e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7047e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5431e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.4333e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.2572e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0113, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0166, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.1624, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0308, device='cuda:0', grad_fn=) cls_loss: tensor(4.9368, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.2318e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.8086, device='cuda:0', grad_fn=) cls_loss: tensor(0.7109, device='cuda:0', grad_fn=) cls_loss: tensor(1.3177, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7039, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.3398e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.3512e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.1922e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2246, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.2581, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.8841, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.1120, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(1.4740, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0308, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.0200e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.1094, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) 7.033683215379003e-05 changing lr ---------------------saving model at epoch 11---------------------------------------------------- epoch 11, time 327.33, cls_loss 0.2434 306 cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0100, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.2280e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.5249, device='cuda:0', grad_fn=) cls_loss: tensor(6.7393e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1383, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.5417, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.6250, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0211, device='cuda:0', grad_fn=) cls_loss: tensor(1.7539, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(1.2835e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.9008e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1822e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0143, device='cuda:0', grad_fn=) cls_loss: tensor(2.0215, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0729e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1029, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.1195e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0352, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5130, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.3184, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(5.5432e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.7116e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1166e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.7703, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.3870, device='cuda:0', grad_fn=) cls_loss: tensor(0.7754, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(9.9540e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.0664e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.8016e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.5764e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.6068, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.5280, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.4209, device='cuda:0', grad_fn=) cls_loss: tensor(5.8035e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.3317, device='cuda:0', grad_fn=) cls_loss: tensor(7.2718e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.4835e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.6309, device='cuda:0', grad_fn=) cls_loss: tensor(1.8418e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.8936e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1823, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.7715, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.0347e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.3195, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.1259e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0482, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.2788, device='cuda:0', grad_fn=) cls_loss: tensor(0.0149, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.6195, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(0.0236, device='cuda:0', grad_fn=) cls_loss: tensor(0.4749, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7923, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0169, device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.5367e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.0677, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0711, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.6757e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.6822e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3040, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.8438e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0475, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0121, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5259e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.6507, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.4175e-06, device='cuda:0', grad_fn=) 6.545084971874738e-05 changing lr epoch 12, time 329.78, cls_loss 0.1202 306 cls_loss: tensor(8.0665e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0530e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.9512, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0129, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.4128, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1750, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1722e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.2876e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.0398e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.1518, device='cuda:0', grad_fn=) cls_loss: tensor(2.3444e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.2914e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0313, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.4320e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.5007, device='cuda:0', grad_fn=) cls_loss: tensor(1.1063, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.2517e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.3942, device='cuda:0', grad_fn=) cls_loss: tensor(0.7014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(7.1208e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0493, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0851, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.2874, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0473, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9935e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.1895e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.6814, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.0466e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.7220e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.3848, device='cuda:0', grad_fn=) cls_loss: tensor(0.7240, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.4010, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.4297, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.2718e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.5499e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.2625, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0928e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1285e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1113, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0124, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1643e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.4615, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.1764e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.2418e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0083, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0201, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.2174e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0832, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0331e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9657e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.7420e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.1722, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.7433e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.0425e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0729e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.4703e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.4792, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.9539e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9525e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.4637e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.4107e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.8138, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.7083, device='cuda:0', grad_fn=) cls_loss: tensor(4.1326e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.6982e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.3049e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.2708, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(1.2755e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(1.1067e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.1447, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.9121, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7537e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.7088e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) 6.039558454088797e-05 changing lr epoch 13, time 329.34, cls_loss 0.0916 306 cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.4571e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7842e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.2912, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.4464e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.5934, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.8320, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0051, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0238, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.9225, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1325e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0355, device='cuda:0', grad_fn=) cls_loss: tensor(0.0055, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.4333e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4702e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0689e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.5033, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.8190, device='cuda:0', grad_fn=) cls_loss: tensor(0.0206, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.9121, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.0902e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0083, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.8477e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0105, device='cuda:0', grad_fn=) cls_loss: tensor(1.0729e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0197, device='cuda:0', grad_fn=) cls_loss: tensor(1.2377, device='cuda:0', grad_fn=) cls_loss: tensor(2.7370, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0302, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.5003e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0110, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3073, device='cuda:0', grad_fn=) cls_loss: tensor(0.2817, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.5517, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5167e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.2463, device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.9327e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0306, device='cuda:0', grad_fn=) cls_loss: tensor(1.0153e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.0797e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.7095e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.8557e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0771, device='cuda:0', grad_fn=) cls_loss: tensor(0.0116, device='cuda:0', grad_fn=) cls_loss: tensor(8.5672e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.4240e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1913, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.6875, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0053, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0815, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0475, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1126e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.5020, device='cuda:0', grad_fn=) cls_loss: tensor(0.3687, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.1630e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.2319e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0257, device='cuda:0', grad_fn=) cls_loss: tensor(1.7484e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0113, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.2849e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.6955e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.0849e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) 5.522642316338269e-05 changing lr epoch 14, time 333.44, cls_loss 0.0604 306 cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3444e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.7023e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.8030e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.0134e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9103e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.0087e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9332e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.2862e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.1590e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.1460e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.2676e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.6397, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.0266e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3724, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(2.9870, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0841, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0153, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(1.6292e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.4909, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.3577e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.7734, device='cuda:0', grad_fn=) cls_loss: tensor(1.1524e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.4909, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0088, device='cuda:0', grad_fn=) cls_loss: tensor(7.8837e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.1590e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7087e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1634, device='cuda:0', grad_fn=) cls_loss: tensor(1.5517e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.7843e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.2862e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.3844e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.7485e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0082, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.5499e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.2431e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1130, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0307, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.3619e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.8346e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.7751e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.9273e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.7352e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.2052, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.2039, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.2955e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.1869e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0055, device='cuda:0', grad_fn=) cls_loss: tensor(0.0715, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) 5.000000000000002e-05 changing lr ---------------------saving model at epoch 15---------------------------------------------------- epoch 15, time 329.08, cls_loss 0.0349 306 cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.6159, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.8828, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0781, device='cuda:0', grad_fn=) cls_loss: tensor(0.7259, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3073, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0367, device='cuda:0', grad_fn=) cls_loss: tensor(3.8743e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.9746, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.7420e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0221, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.6725, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.1128e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.5499e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.5590e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.8676e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.8073, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.1797e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.1784e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0628, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.0664e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9140e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.6146, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.6098, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0100, device='cuda:0', grad_fn=) cls_loss: tensor(3.1372e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1408, device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.3429, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.2451e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0173e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.5499e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0938, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1524e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.2266e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9009e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.8034, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.2900, device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1007, device='cuda:0', grad_fn=) cls_loss: tensor(2.0313, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.3244, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.8942e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.5928e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.6095e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.8610e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(4.0531e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.0464e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.9219, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1524e-06, device='cuda:0', grad_fn=) 4.4773576836617344e-05 changing lr epoch 16, time 329.02, cls_loss 0.0865 306 cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.5168, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.6272e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3252e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0102, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.2964, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5517e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5497e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.4808, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0095, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(8.9407e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9670e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1358e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0176, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.3578e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.5157, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.8214e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.1067, device='cuda:0', grad_fn=) cls_loss: tensor(0.3818, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0488, device='cuda:0', grad_fn=) cls_loss: tensor(3.2365e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.1460e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.6789e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.8876e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0072, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0597, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0079, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0086, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.8008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0336, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.6769e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.3776, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0053, device='cuda:0', grad_fn=) cls_loss: tensor(3.7154e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.8047, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.3578e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.2655, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.4069e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.8676e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0133e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.4813, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7285e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.6757e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0928e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.3949e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1689, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.3305e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.4505, device='cuda:0', grad_fn=) cls_loss: tensor(0.0056, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.5764e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7041e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0510e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0.0688, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0306, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0894, device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0150, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0977, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) 3.9604415459112035e-05 changing lr epoch 17, time 329.91, cls_loss 0.0235 306 cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0055, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7226e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.2689, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0488, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.6890e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0160, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.1717e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1740, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0710, device='cuda:0', grad_fn=) cls_loss: tensor(0.0258, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.6690e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.2252e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.4583, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.2652e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1608, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1182, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(6.7949e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.2347e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.2981e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0207, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1166, device='cuda:0', grad_fn=) cls_loss: tensor(0.4443, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1325e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.6569, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.2849e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0365, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.2175e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1319e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.6113, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.0206e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.9074e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.6067e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9326e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0171, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.6789e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0096, device='cuda:0', grad_fn=) cls_loss: tensor(5.6068e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.4479, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3828, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.2233e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) 3.4549150281252636e-05 changing lr epoch 18, time 327.45, cls_loss 0.0265 306 cls_loss: tensor(8.9407e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.5313e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0107, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.2517e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.5040e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3723e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0872, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.2517e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(1.3113e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.6955e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0551, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.8413e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0058, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.5499e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.5267, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.4375, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.6095e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.7354e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.2676e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.2314, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0669e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0192, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.4625e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.6346e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.7022e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.0796e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.0597e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0183, device='cuda:0', grad_fn=) cls_loss: tensor(1.1524e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7219e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(4.4505e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5034e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.0466e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.2200e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.0994e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) 2.966316784621e-05 changing lr epoch 19, time 330.49, cls_loss 0.0079 306 cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.8374e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0450, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.9460, device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9133, device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.3438, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0928e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.3240e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0055, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0488, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.5129e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.2637, device='cuda:0', grad_fn=) cls_loss: tensor(1.0215, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.8014e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.5764e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0288, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.0332e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.2312e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1640, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.0208, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.6292e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.1590e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.0200e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0319, device='cuda:0', grad_fn=) cls_loss: tensor(0.0145, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.6789e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.2200e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.5579, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.2254e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0232e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1901e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.2848e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.1724e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1305, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3444e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.0351, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.2240, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.6967, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.1086e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.3052, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0315, device='cuda:0', grad_fn=) cls_loss: tensor(1.7484e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.2354, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1126e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.9313e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.9173, device='cuda:0', grad_fn=) cls_loss: tensor(1.4106e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5040e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.3965, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.4683e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.6757e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.9645e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.2254e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) 2.5000000000000015e-05 changing lr epoch 20, time 329.35, cls_loss 0.0503 306 cls_loss: tensor(6.5843e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1451, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0455, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.6815, device='cuda:0', grad_fn=) cls_loss: tensor(0.0551, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.1459e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3709e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.8710e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.7286e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0345, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.9008e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.3058, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.0705e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.9804e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0533, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.4460, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.2120e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.1882e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9479, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.7611, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.5052, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.5169, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(4.8478e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0172, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5775e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0769e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0689, device='cuda:0', grad_fn=) cls_loss: tensor(0.1539, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0063, device='cuda:0', grad_fn=) cls_loss: tensor(2.3047e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.8239e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.0466e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7881e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0070, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1668, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.8594, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0185, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7716e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.4359, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.8516, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) 2.0610737385376352e-05 changing lr epoch 21, time 326.17, cls_loss 0.0363 306 cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3242, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0928e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.8477e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0067, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1696, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0070, device='cuda:0', grad_fn=) cls_loss: tensor(5.2790e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.3988, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.4372e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5656e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.4438e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0133, device='cuda:0', grad_fn=) cls_loss: tensor(5.3922e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.4750e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0403, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1722, device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0402, device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.4506e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0729e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.2650e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0697, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.0531e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.4258, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.7354e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0229, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0669e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.6094e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0198, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9140e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.4199e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.4710, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.3447e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.7021e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0267, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0427, device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.6042e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5299e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.0140e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.0135e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.4590, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.9604e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.6027e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0302, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) 1.654346968205711e-05 changing lr epoch 22, time 327.71, cls_loss 0.0188 306 cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.8358e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0395, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5994e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.3709e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.7721, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0610e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1143, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(1.8736e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1377, device='cuda:0', grad_fn=) cls_loss: tensor(1.2418e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0530e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0987e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.8080e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.0201e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.7486e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5034e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0169, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.3578e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.7617e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.6436, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.0001e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0106, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0183, device='cuda:0', grad_fn=) cls_loss: tensor(0.2277, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0476, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(1.6279, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0076, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.7219e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.4639e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5034e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5100e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0191, device='cuda:0', grad_fn=) 1.2842758726130304e-05 changing lr epoch 23, time 330.17, cls_loss 0.0121 306 cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.1591e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.7181, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.6292e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3544e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(2.9751, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0083, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.2054e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0925, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.6570e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0184, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0459, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.4702e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.7213e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.0915e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1667, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.4239e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.3910e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.2122e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1126e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.8949e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0108, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.1127e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.9008e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.2581, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.3910e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.7156e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.5499e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0851, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0530e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1414, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0103, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) 9.549150281252636e-06 changing lr epoch 24, time 328.04, cls_loss 0.0214 306 cls_loss: tensor(2.6623e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.2916e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.2121e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0782, device='cuda:0', grad_fn=) cls_loss: tensor(3.0200e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(7.3512e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.3844e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.5221e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.4504e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.2718e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.0464e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.7418e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0353, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.6988, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0182, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.6188, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.3944, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.9407e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.8122e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1230, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5924, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.5775e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.9407e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.2716e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0234, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.7552e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.0069e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5775e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.1075e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.2518e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.8577e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.9407e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.9539e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.6559, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.0464e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(3.5167e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0065, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) 6.698729810778068e-06 changing lr epoch 25, time 329.09, cls_loss 0.0270 306 cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.3976e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.0267e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.2385e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5193e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.6212e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7188, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0060, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.2385e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9471e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5882e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0160, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.6689e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1259e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1965, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.5697e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7365e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.0001e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.2862e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0331e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.1094e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.0378, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0073, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0194, device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.8875e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.1564e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.4068e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0080, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.8279e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5457e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0881, device='cuda:0', grad_fn=) cls_loss: tensor(9.3381e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.2386e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.6560e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3491e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.0466e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0141, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0928e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7021e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1270, device='cuda:0', grad_fn=) cls_loss: tensor(2.5193e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1743, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.1339e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.5698e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9213e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) 4.322727117869953e-06 changing lr epoch 26, time 328.98, cls_loss 0.0144 306 cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.3233e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.4637e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5431e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.1062e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.7220e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.2318e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.8731, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7186e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.2981e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.8014e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0196, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.7486e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(1.6272e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.5566e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0972, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5332e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.3644e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.7155e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7418e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.1870e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(2.7676e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7683e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.9539e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.5763e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.9668, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.7760, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5696e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3351e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.9010e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3842e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5279e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.4305e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.5055e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5299e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.3645e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5034e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(3.0597e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9014e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.8100e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.5817e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) 2.447174185242324e-06 changing lr epoch 27, time 334.18, cls_loss 0.0122 306 cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(9.5367e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0457, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.7442, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.9802e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.5385e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.0797e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.1345e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.1855e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0530e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(5.2452e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.1460e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.3454, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0.0070, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.3710e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.5367e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.5829e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.6175, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.5433e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.4756e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0815, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.7750e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.5631e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.6955e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.0134e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1126e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.7354e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.3289, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.1990e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.0663e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.9720, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(2.4418e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.2259e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.2875e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0101, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3908e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0755, device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.1592e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5299e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.9341e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.2591, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.6149e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.5498e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.2716e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.4493e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.7551e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.1657e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) 1.092619963309716e-06 changing lr epoch 28, time 332.14, cls_loss 0.0115 306 cls_loss: tensor(3.3776e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.7353e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0063, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.9671e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.1921e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.2107e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9207e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.3709e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0447, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.2849e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.8477e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.0339e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0063, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0228, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.0598e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.8148e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5299e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1348, device='cuda:0', grad_fn=) cls_loss: tensor(3.1789e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(9.1394e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.7618e-07, device='cuda:0', grad_fn=) cls_loss: tensor(6.4174e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.1460e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(6.0002e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.8014e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.4571e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7815e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0883, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(8.1460e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0081, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.9027e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.3862e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.1860, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.0247, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.8612e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0416, device='cuda:0', grad_fn=) cls_loss: tensor(7.9473e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5914e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.1922e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.7829e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.5558e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.2040e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3709e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.7420e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.1723e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(2.7617e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(8.7023e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.4504e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0.0308, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.9736e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(3.8743e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.9868e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(1.5895e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1126e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-08, device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) cls_loss: tensor(0., device='cuda:0', grad_fn=) 2.7390523158633003e-07 changing lr epoch 29, time 333.24, cls_loss 0.0053 ---------------------saving last model at epoch 29---------------------------------------------------- /home/yuqian_fu {'gpu': '0', 'svroot': '/data/work-gcp-europe-west4-a/yuqian_fu/datasets/SingleSourceDG/saved-PACS//art_painting/CA_multiple_16fa_v2_ep30_lr0.01_cosine_base0.01_bs6_lamCa_1_lamRe1_adt4_cls1_EW2_70_rmTrue_rnTrue_str5', 'source_domain': 'art_painting', 'svpath': '/data/work-gcp-europe-west4-a/yuqian_fu/datasets/SingleSourceDG/saved-PACS//art_painting/CA_multiple_16fa_v2_ep30_lr0.01_cosine_base0.01_bs6_lamCa_1_lamRe1_adt4_cls1_EW2_70_rmTrue_rnTrue_str5/art_painting_16factor_best_test_check.csv', 'factor_num': 16, 'epoch': 'best', 'stride': 5, 'eval_mapping': False, 'network': 'resnet18'} -------------------------------------loading pretrain weights---------------------------------- loading weight of best columns: ['art_painting', 'cartoon', 'photo', 'sketch'] /data/work-gcp-europe-west4-a/yuqian_fu/datasets/SingleSourceDG/data/PACS/art_painting_test.hdf5 torch.Size([2048, 3, 227, 227]) torch.Size([2048]) /data/work-gcp-europe-west4-a/yuqian_fu/datasets/SingleSourceDG/data/PACS/cartoon_test.hdf5 torch.Size([2344, 3, 227, 227]) torch.Size([2344]) /data/work-gcp-europe-west4-a/yuqian_fu/datasets/SingleSourceDG/data/PACS/photo_test.hdf5 torch.Size([1670, 3, 227, 227]) torch.Size([1670]) /data/work-gcp-europe-west4-a/yuqian_fu/datasets/SingleSourceDG/data/PACS/sketch_test.hdf5 torch.Size([3929, 3, 227, 227]) torch.Size([3929]) art_painting cartoon photo sketch Avg w/o do (original x) 18.505859 16.595563 11.317365 19.648766 15.853898