/home/yuqian_fu {'gpu': '0', 'data': 'mnist', 'ntr': None, 'translate': None, 'autoaug': 'CA_multiple', 'n': 3, 'stride': 3, 'factor_num': 14, 'epochs': 100, 'nbatch': 100, 'batchsize': 32, 'lr': 0.0001, 'lr_scheduler': 'Step', 'svroot': '/data/work-gcp-europe-west4-a/yuqian_fu/datasets/SingleSourceDG/saved-digit/CA_multiple_14fa_all_ep100_lr1e-4_lr_schedulerStep0.8_bs32_lamCa_1_lamRe_1_cls1_adt2_EW2_100_rmTrue_rnTrue_str3', 'clsadapt': True, 'lambda_causal': 1.0, 'lambda_re': 1.0, 'randm': True, 'randn': True, 'network': 'resnet18'} --------------------------CA_multiple-------------------------- ---------------------------14 factors----------------- randm: True randn: True n: 3 randm: False 100 cls_loss: tensor(2.3018, device='cuda:0', grad_fn=) cls_loss: tensor(2.2967, device='cuda:0', grad_fn=) cls_loss: tensor(2.2914, device='cuda:0', grad_fn=) cls_loss: tensor(2.2888, device='cuda:0', grad_fn=) cls_loss: tensor(2.3006, device='cuda:0', grad_fn=) cls_loss: tensor(2.2421, device='cuda:0', grad_fn=) cls_loss: tensor(2.2769, device='cuda:0', grad_fn=) cls_loss: tensor(2.2609, device='cuda:0', grad_fn=) cls_loss: tensor(2.2621, device='cuda:0', grad_fn=) cls_loss: tensor(2.2491, device='cuda:0', grad_fn=) cls_loss: tensor(2.2366, device='cuda:0', grad_fn=) cls_loss: tensor(2.2654, device='cuda:0', grad_fn=) cls_loss: tensor(2.1621, device='cuda:0', grad_fn=) cls_loss: tensor(2.2061, device='cuda:0', grad_fn=) cls_loss: tensor(2.1670, device='cuda:0', grad_fn=) cls_loss: tensor(2.2152, device='cuda:0', grad_fn=) cls_loss: tensor(2.1575, device='cuda:0', grad_fn=) cls_loss: tensor(2.1013, device='cuda:0', grad_fn=) cls_loss: tensor(2.1034, device='cuda:0', grad_fn=) cls_loss: tensor(2.0826, device='cuda:0', grad_fn=) cls_loss: tensor(2.0939, device='cuda:0', grad_fn=) cls_loss: tensor(2.0103, device='cuda:0', grad_fn=) cls_loss: tensor(1.9794, device='cuda:0', grad_fn=) cls_loss: tensor(1.8126, device='cuda:0', grad_fn=) cls_loss: tensor(1.9371, device='cuda:0', grad_fn=) cls_loss: tensor(1.8364, device='cuda:0', grad_fn=) cls_loss: tensor(1.9253, device='cuda:0', grad_fn=) cls_loss: tensor(1.7372, device='cuda:0', grad_fn=) cls_loss: tensor(1.7174, device='cuda:0', grad_fn=) cls_loss: tensor(1.5920, device='cuda:0', grad_fn=) cls_loss: tensor(1.5893, device='cuda:0', grad_fn=) cls_loss: tensor(1.4467, device='cuda:0', grad_fn=) cls_loss: tensor(1.5729, device='cuda:0', grad_fn=) cls_loss: tensor(1.3543, device='cuda:0', grad_fn=) cls_loss: tensor(1.4001, device='cuda:0', grad_fn=) cls_loss: tensor(1.1536, device='cuda:0', grad_fn=) cls_loss: tensor(1.0905, device='cuda:0', grad_fn=) cls_loss: tensor(1.3137, device='cuda:0', grad_fn=) cls_loss: tensor(1.0437, device='cuda:0', grad_fn=) cls_loss: tensor(1.0477, device='cuda:0', grad_fn=) cls_loss: tensor(1.2285, device='cuda:0', grad_fn=) cls_loss: tensor(1.0137, device='cuda:0', grad_fn=) cls_loss: tensor(0.9674, device='cuda:0', grad_fn=) cls_loss: tensor(0.8464, device='cuda:0', grad_fn=) cls_loss: tensor(0.9117, device='cuda:0', grad_fn=) cls_loss: tensor(1.0147, device='cuda:0', grad_fn=) cls_loss: tensor(0.9361, device='cuda:0', grad_fn=) cls_loss: tensor(0.8418, device='cuda:0', grad_fn=) cls_loss: tensor(0.6077, device='cuda:0', grad_fn=) cls_loss: tensor(1.0597, device='cuda:0', grad_fn=) cls_loss: tensor(0.8701, device='cuda:0', grad_fn=) cls_loss: tensor(0.7677, device='cuda:0', grad_fn=) cls_loss: tensor(0.6842, device='cuda:0', grad_fn=) cls_loss: tensor(0.5644, device='cuda:0', grad_fn=) cls_loss: tensor(0.5795, device='cuda:0', grad_fn=) cls_loss: tensor(0.5277, device='cuda:0', grad_fn=) cls_loss: tensor(0.7138, device='cuda:0', grad_fn=) cls_loss: tensor(0.8457, device='cuda:0', grad_fn=) cls_loss: tensor(0.6138, device='cuda:0', grad_fn=) cls_loss: tensor(0.4875, device='cuda:0', grad_fn=) cls_loss: tensor(0.5593, device='cuda:0', grad_fn=) cls_loss: tensor(0.3487, device='cuda:0', grad_fn=) cls_loss: tensor(0.4254, device='cuda:0', grad_fn=) cls_loss: tensor(0.5703, device='cuda:0', grad_fn=) cls_loss: tensor(0.4574, device='cuda:0', grad_fn=) cls_loss: tensor(0.3646, device='cuda:0', grad_fn=) cls_loss: tensor(0.6345, device='cuda:0', grad_fn=) cls_loss: tensor(0.3261, device='cuda:0', grad_fn=) cls_loss: tensor(0.6351, device='cuda:0', grad_fn=) cls_loss: tensor(0.8419, device='cuda:0', grad_fn=) cls_loss: tensor(0.3598, device='cuda:0', grad_fn=) cls_loss: tensor(0.4624, device='cuda:0', grad_fn=) cls_loss: tensor(0.5175, device='cuda:0', grad_fn=) cls_loss: tensor(0.4175, device='cuda:0', grad_fn=) cls_loss: tensor(0.6228, device='cuda:0', grad_fn=) cls_loss: tensor(0.6843, device='cuda:0', grad_fn=) cls_loss: tensor(0.3285, device='cuda:0', grad_fn=) cls_loss: tensor(0.6655, device='cuda:0', grad_fn=) cls_loss: tensor(0.4080, device='cuda:0', grad_fn=) cls_loss: tensor(0.2978, device='cuda:0', grad_fn=) cls_loss: tensor(0.5323, device='cuda:0', grad_fn=) cls_loss: tensor(0.6533, device='cuda:0', grad_fn=) cls_loss: tensor(0.2361, device='cuda:0', grad_fn=) cls_loss: tensor(0.6204, device='cuda:0', grad_fn=) cls_loss: tensor(0.6117, device='cuda:0', grad_fn=) cls_loss: tensor(0.6199, device='cuda:0', grad_fn=) cls_loss: tensor(0.4986, device='cuda:0', grad_fn=) cls_loss: tensor(0.5154, device='cuda:0', grad_fn=) cls_loss: tensor(0.4473, device='cuda:0', grad_fn=) cls_loss: tensor(0.5121, device='cuda:0', grad_fn=) cls_loss: tensor(0.5725, device='cuda:0', grad_fn=) cls_loss: tensor(0.4984, device='cuda:0', grad_fn=) cls_loss: tensor(0.5710, device='cuda:0', grad_fn=) cls_loss: tensor(0.3749, device='cuda:0', grad_fn=) cls_loss: tensor(0.3436, device='cuda:0', grad_fn=) cls_loss: tensor(0.2730, device='cuda:0', grad_fn=) cls_loss: tensor(0.3529, device='cuda:0', grad_fn=) cls_loss: tensor(0.5221, device='cuda:0', grad_fn=) cls_loss: tensor(0.4056, device='cuda:0', grad_fn=) cls_loss: tensor(0.4988, device='cuda:0', grad_fn=) 0.0001 changing lr ---------------------saving model at epoch 0---------------------------------------------------- epoch 0, time 182.72, cls_loss 1.1179 100 cls_loss: tensor(0.6869, device='cuda:0', grad_fn=) cls_loss: tensor(0.3712, device='cuda:0', grad_fn=) cls_loss: tensor(0.5848, device='cuda:0', grad_fn=) cls_loss: tensor(0.3256, device='cuda:0', grad_fn=) cls_loss: tensor(0.7101, device='cuda:0', grad_fn=) cls_loss: tensor(0.4262, device='cuda:0', grad_fn=) cls_loss: tensor(0.2651, device='cuda:0', grad_fn=) cls_loss: tensor(0.3633, device='cuda:0', grad_fn=) cls_loss: tensor(0.3626, device='cuda:0', grad_fn=) cls_loss: tensor(0.4211, device='cuda:0', grad_fn=) cls_loss: tensor(0.4060, device='cuda:0', grad_fn=) cls_loss: tensor(0.4459, device='cuda:0', grad_fn=) cls_loss: tensor(0.3686, device='cuda:0', grad_fn=) cls_loss: tensor(0.4626, device='cuda:0', grad_fn=) cls_loss: tensor(0.4834, device='cuda:0', grad_fn=) cls_loss: tensor(0.5015, device='cuda:0', grad_fn=) cls_loss: tensor(0.8156, device='cuda:0', grad_fn=) cls_loss: tensor(0.3784, device='cuda:0', grad_fn=) cls_loss: tensor(0.4146, device='cuda:0', grad_fn=) cls_loss: tensor(0.4766, device='cuda:0', grad_fn=) cls_loss: tensor(0.2340, device='cuda:0', grad_fn=) cls_loss: tensor(0.3481, device='cuda:0', grad_fn=) cls_loss: tensor(0.4575, device='cuda:0', grad_fn=) cls_loss: tensor(0.3822, device='cuda:0', grad_fn=) cls_loss: tensor(0.1820, device='cuda:0', grad_fn=) cls_loss: tensor(0.1624, device='cuda:0', grad_fn=) cls_loss: tensor(0.2620, device='cuda:0', grad_fn=) cls_loss: tensor(0.4822, device='cuda:0', grad_fn=) cls_loss: tensor(0.1930, device='cuda:0', grad_fn=) cls_loss: tensor(0.3789, device='cuda:0', grad_fn=) cls_loss: tensor(0.4260, device='cuda:0', grad_fn=) cls_loss: tensor(0.3001, device='cuda:0', grad_fn=) cls_loss: tensor(0.2250, device='cuda:0', grad_fn=) cls_loss: tensor(0.4114, device='cuda:0', grad_fn=) cls_loss: tensor(0.2384, device='cuda:0', grad_fn=) cls_loss: tensor(0.3781, device='cuda:0', grad_fn=) cls_loss: tensor(0.3776, device='cuda:0', grad_fn=) cls_loss: tensor(0.5152, device='cuda:0', grad_fn=) cls_loss: tensor(0.2852, device='cuda:0', grad_fn=) cls_loss: tensor(0.4645, device='cuda:0', grad_fn=) cls_loss: tensor(0.2827, device='cuda:0', grad_fn=) cls_loss: tensor(0.3391, device='cuda:0', grad_fn=) cls_loss: tensor(0.3140, device='cuda:0', grad_fn=) cls_loss: tensor(0.2310, device='cuda:0', grad_fn=) cls_loss: tensor(0.3194, device='cuda:0', grad_fn=) cls_loss: tensor(0.2205, device='cuda:0', grad_fn=) cls_loss: tensor(0.4111, device='cuda:0', grad_fn=) cls_loss: tensor(0.3685, device='cuda:0', grad_fn=) cls_loss: tensor(0.2552, device='cuda:0', grad_fn=) cls_loss: tensor(0.1288, device='cuda:0', grad_fn=) cls_loss: tensor(0.2085, device='cuda:0', grad_fn=) cls_loss: tensor(0.4238, device='cuda:0', grad_fn=) cls_loss: tensor(0.4571, device='cuda:0', grad_fn=) cls_loss: tensor(0.3442, device='cuda:0', grad_fn=) cls_loss: tensor(0.4611, device='cuda:0', grad_fn=) cls_loss: tensor(0.6635, device='cuda:0', grad_fn=) cls_loss: tensor(0.2551, device='cuda:0', grad_fn=) cls_loss: tensor(0.5264, device='cuda:0', grad_fn=) cls_loss: tensor(0.1205, device='cuda:0', grad_fn=) cls_loss: tensor(0.1469, device='cuda:0', grad_fn=) cls_loss: tensor(0.1476, device='cuda:0', grad_fn=) cls_loss: tensor(0.1771, device='cuda:0', grad_fn=) cls_loss: tensor(0.2374, device='cuda:0', grad_fn=) cls_loss: tensor(0.3235, device='cuda:0', grad_fn=) cls_loss: tensor(0.1842, device='cuda:0', grad_fn=) cls_loss: tensor(0.2830, device='cuda:0', grad_fn=) cls_loss: tensor(0.2061, device='cuda:0', grad_fn=) cls_loss: tensor(0.3261, device='cuda:0', grad_fn=) cls_loss: tensor(0.5239, device='cuda:0', grad_fn=) cls_loss: tensor(0.2846, device='cuda:0', grad_fn=) cls_loss: tensor(0.5557, device='cuda:0', grad_fn=) cls_loss: tensor(0.2296, device='cuda:0', grad_fn=) cls_loss: tensor(0.1787, device='cuda:0', grad_fn=) cls_loss: tensor(0.1321, device='cuda:0', grad_fn=) cls_loss: tensor(0.2944, device='cuda:0', grad_fn=) cls_loss: tensor(0.3349, device='cuda:0', grad_fn=) cls_loss: tensor(0.3299, device='cuda:0', grad_fn=) cls_loss: tensor(0.1269, device='cuda:0', grad_fn=) cls_loss: tensor(0.2107, device='cuda:0', grad_fn=) cls_loss: tensor(0.4164, device='cuda:0', grad_fn=) cls_loss: tensor(0.6172, device='cuda:0', grad_fn=) cls_loss: tensor(0.1464, device='cuda:0', grad_fn=) cls_loss: tensor(0.1707, device='cuda:0', grad_fn=) cls_loss: tensor(0.2850, device='cuda:0', grad_fn=) cls_loss: tensor(0.3688, device='cuda:0', grad_fn=) cls_loss: tensor(0.4021, device='cuda:0', grad_fn=) cls_loss: tensor(0.2294, device='cuda:0', grad_fn=) cls_loss: tensor(0.3534, device='cuda:0', grad_fn=) cls_loss: tensor(0.2481, device='cuda:0', grad_fn=) cls_loss: tensor(0.3390, device='cuda:0', grad_fn=) cls_loss: tensor(0.1398, device='cuda:0', grad_fn=) cls_loss: tensor(0.1629, device='cuda:0', grad_fn=) cls_loss: tensor(0.5124, device='cuda:0', grad_fn=) cls_loss: tensor(0.1330, device='cuda:0', grad_fn=) cls_loss: tensor(0.3258, device='cuda:0', grad_fn=) cls_loss: tensor(0.2945, device='cuda:0', grad_fn=) cls_loss: tensor(0.0680, device='cuda:0', grad_fn=) cls_loss: tensor(0.3429, device='cuda:0', grad_fn=) cls_loss: tensor(0.1665, device='cuda:0', grad_fn=) cls_loss: tensor(0.1723, device='cuda:0', grad_fn=) 0.0001 changing lr ---------------------saving model at epoch 1---------------------------------------------------- epoch 1, time 181.61, cls_loss 0.3343 100 cls_loss: tensor(0.1039, device='cuda:0', grad_fn=) cls_loss: tensor(0.3162, device='cuda:0', grad_fn=) cls_loss: tensor(0.1866, device='cuda:0', grad_fn=) cls_loss: tensor(0.1647, device='cuda:0', grad_fn=) cls_loss: tensor(0.1711, device='cuda:0', grad_fn=) cls_loss: tensor(0.2611, device='cuda:0', grad_fn=) cls_loss: tensor(0.3073, device='cuda:0', grad_fn=) cls_loss: tensor(0.1203, device='cuda:0', grad_fn=) cls_loss: tensor(0.2870, device='cuda:0', grad_fn=) cls_loss: tensor(0.1670, device='cuda:0', grad_fn=) cls_loss: tensor(0.3047, device='cuda:0', grad_fn=) cls_loss: tensor(0.2275, device='cuda:0', grad_fn=) cls_loss: tensor(0.5310, device='cuda:0', grad_fn=) cls_loss: tensor(0.2183, device='cuda:0', grad_fn=) cls_loss: tensor(0.4504, device='cuda:0', grad_fn=) cls_loss: tensor(0.4902, device='cuda:0', grad_fn=) cls_loss: tensor(0.3023, device='cuda:0', grad_fn=) cls_loss: tensor(0.2660, device='cuda:0', grad_fn=) cls_loss: tensor(0.0481, device='cuda:0', grad_fn=) cls_loss: tensor(0.1920, device='cuda:0', grad_fn=) cls_loss: tensor(0.1389, device='cuda:0', grad_fn=) cls_loss: tensor(0.1213, device='cuda:0', grad_fn=) cls_loss: tensor(0.1771, device='cuda:0', grad_fn=) cls_loss: tensor(0.2130, device='cuda:0', grad_fn=) cls_loss: tensor(0.2634, device='cuda:0', grad_fn=) cls_loss: tensor(0.1608, device='cuda:0', grad_fn=) cls_loss: tensor(0.1108, device='cuda:0', grad_fn=) cls_loss: tensor(0.3517, device='cuda:0', grad_fn=) cls_loss: tensor(0.2239, device='cuda:0', grad_fn=) cls_loss: tensor(0.2674, device='cuda:0', grad_fn=) cls_loss: tensor(0.3311, device='cuda:0', grad_fn=) cls_loss: tensor(0.1465, device='cuda:0', grad_fn=) cls_loss: tensor(0.0860, device='cuda:0', grad_fn=) cls_loss: tensor(0.1758, device='cuda:0', grad_fn=) cls_loss: tensor(0.3268, device='cuda:0', grad_fn=) cls_loss: tensor(0.1861, device='cuda:0', grad_fn=) cls_loss: tensor(0.1798, device='cuda:0', grad_fn=) cls_loss: tensor(0.2191, device='cuda:0', grad_fn=) cls_loss: tensor(0.4590, device='cuda:0', grad_fn=) cls_loss: tensor(0.1614, device='cuda:0', grad_fn=) cls_loss: tensor(0.2103, device='cuda:0', grad_fn=) cls_loss: tensor(0.1354, device='cuda:0', grad_fn=) cls_loss: tensor(0.1177, device='cuda:0', grad_fn=) cls_loss: tensor(0.0761, device='cuda:0', grad_fn=) cls_loss: tensor(0.1868, device='cuda:0', grad_fn=) cls_loss: tensor(0.1273, device='cuda:0', grad_fn=) cls_loss: tensor(0.3154, device='cuda:0', grad_fn=) cls_loss: tensor(0.1834, device='cuda:0', grad_fn=) cls_loss: tensor(0.2937, device='cuda:0', grad_fn=) cls_loss: tensor(0.1408, device='cuda:0', grad_fn=) cls_loss: tensor(0.1858, device='cuda:0', grad_fn=) cls_loss: tensor(0.0781, device='cuda:0', grad_fn=) cls_loss: tensor(0.2233, device='cuda:0', grad_fn=) cls_loss: tensor(0.1573, device='cuda:0', grad_fn=) cls_loss: tensor(0.1815, device='cuda:0', grad_fn=) cls_loss: tensor(0.2661, device='cuda:0', grad_fn=) cls_loss: tensor(0.4841, device='cuda:0', grad_fn=) cls_loss: tensor(0.1589, device='cuda:0', grad_fn=) cls_loss: tensor(0.3504, device='cuda:0', grad_fn=) cls_loss: tensor(0.0743, device='cuda:0', grad_fn=) cls_loss: tensor(0.3965, device='cuda:0', grad_fn=) cls_loss: tensor(0.1111, device='cuda:0', grad_fn=) cls_loss: tensor(0.1006, device='cuda:0', grad_fn=) cls_loss: tensor(0.4098, device='cuda:0', grad_fn=) cls_loss: tensor(0.3084, device='cuda:0', grad_fn=) cls_loss: tensor(0.2935, device='cuda:0', grad_fn=) cls_loss: tensor(0.2041, device='cuda:0', grad_fn=) cls_loss: tensor(0.1508, device='cuda:0', grad_fn=) cls_loss: tensor(0.3373, device='cuda:0', grad_fn=) cls_loss: tensor(0.1398, device='cuda:0', grad_fn=) cls_loss: tensor(0.2156, device='cuda:0', grad_fn=) cls_loss: tensor(0.1845, device='cuda:0', grad_fn=) cls_loss: tensor(0.2842, device='cuda:0', grad_fn=) cls_loss: tensor(0.1436, device='cuda:0', grad_fn=) cls_loss: tensor(0.0574, device='cuda:0', grad_fn=) cls_loss: tensor(0.1657, device='cuda:0', grad_fn=) cls_loss: tensor(0.1268, device='cuda:0', grad_fn=) cls_loss: tensor(0.1298, device='cuda:0', grad_fn=) cls_loss: tensor(0.1032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0864, device='cuda:0', grad_fn=) cls_loss: tensor(0.5140, device='cuda:0', grad_fn=) cls_loss: tensor(0.0932, device='cuda:0', grad_fn=) cls_loss: tensor(0.2167, device='cuda:0', grad_fn=) cls_loss: tensor(0.1425, device='cuda:0', grad_fn=) cls_loss: tensor(0.2097, device='cuda:0', grad_fn=) cls_loss: tensor(0.1434, device='cuda:0', grad_fn=) cls_loss: tensor(0.0417, device='cuda:0', grad_fn=) cls_loss: tensor(0.3224, device='cuda:0', grad_fn=) cls_loss: tensor(0.3030, device='cuda:0', grad_fn=) cls_loss: tensor(0.1312, device='cuda:0', grad_fn=) cls_loss: tensor(0.3364, device='cuda:0', grad_fn=) cls_loss: tensor(0.1071, device='cuda:0', grad_fn=) cls_loss: tensor(0.1733, device='cuda:0', grad_fn=) cls_loss: tensor(0.5018, device='cuda:0', grad_fn=) cls_loss: tensor(0.1199, device='cuda:0', grad_fn=) cls_loss: tensor(0.2147, device='cuda:0', grad_fn=) cls_loss: tensor(0.2736, device='cuda:0', grad_fn=) cls_loss: tensor(0.6004, device='cuda:0', grad_fn=) cls_loss: tensor(0.4873, device='cuda:0', grad_fn=) cls_loss: tensor(0.0675, device='cuda:0', grad_fn=) 0.0001 changing lr ---------------------saving model at epoch 2---------------------------------------------------- epoch 2, time 181.39, cls_loss 0.2231 100 cls_loss: tensor(0.0775, device='cuda:0', grad_fn=) cls_loss: tensor(0.1717, device='cuda:0', grad_fn=) cls_loss: tensor(0.1921, device='cuda:0', grad_fn=) cls_loss: tensor(0.3511, device='cuda:0', grad_fn=) cls_loss: tensor(0.1005, device='cuda:0', grad_fn=) cls_loss: tensor(0.4275, device='cuda:0', grad_fn=) cls_loss: tensor(0.3294, device='cuda:0', grad_fn=) cls_loss: tensor(0.1252, device='cuda:0', grad_fn=) cls_loss: tensor(0.1354, device='cuda:0', grad_fn=) cls_loss: tensor(0.0542, device='cuda:0', grad_fn=) cls_loss: tensor(0.2659, device='cuda:0', grad_fn=) cls_loss: tensor(0.2071, device='cuda:0', grad_fn=) cls_loss: tensor(0.2721, device='cuda:0', grad_fn=) cls_loss: tensor(0.0854, device='cuda:0', grad_fn=) cls_loss: tensor(0.0428, device='cuda:0', grad_fn=) cls_loss: tensor(0.1451, device='cuda:0', grad_fn=) cls_loss: tensor(0.2318, device='cuda:0', grad_fn=) cls_loss: tensor(0.1840, device='cuda:0', grad_fn=) cls_loss: tensor(0.0340, device='cuda:0', grad_fn=) cls_loss: tensor(0.0443, device='cuda:0', grad_fn=) cls_loss: tensor(0.0615, device='cuda:0', grad_fn=) cls_loss: tensor(0.0836, device='cuda:0', grad_fn=) cls_loss: tensor(0.2699, device='cuda:0', grad_fn=) cls_loss: tensor(0.4841, device='cuda:0', grad_fn=) cls_loss: tensor(0.1923, device='cuda:0', grad_fn=) cls_loss: tensor(0.0228, device='cuda:0', grad_fn=) cls_loss: tensor(0.1558, device='cuda:0', grad_fn=) cls_loss: tensor(0.4641, device='cuda:0', grad_fn=) cls_loss: tensor(0.0728, device='cuda:0', grad_fn=) cls_loss: tensor(0.2402, device='cuda:0', grad_fn=) cls_loss: tensor(0.0659, device='cuda:0', grad_fn=) cls_loss: tensor(0.2287, device='cuda:0', grad_fn=) cls_loss: tensor(0.4113, device='cuda:0', grad_fn=) cls_loss: tensor(0.1452, device='cuda:0', grad_fn=) cls_loss: tensor(0.1573, device='cuda:0', grad_fn=) cls_loss: tensor(0.3160, device='cuda:0', grad_fn=) cls_loss: tensor(0.2446, device='cuda:0', grad_fn=) cls_loss: tensor(0.1034, device='cuda:0', grad_fn=) cls_loss: tensor(0.3072, device='cuda:0', grad_fn=) cls_loss: tensor(0.1324, device='cuda:0', grad_fn=) cls_loss: tensor(0.1271, device='cuda:0', grad_fn=) cls_loss: tensor(0.2528, device='cuda:0', grad_fn=) cls_loss: tensor(0.1448, device='cuda:0', grad_fn=) cls_loss: tensor(0.1158, device='cuda:0', grad_fn=) cls_loss: tensor(0.0276, device='cuda:0', grad_fn=) cls_loss: tensor(0.1161, device='cuda:0', grad_fn=) cls_loss: tensor(0.0826, device='cuda:0', grad_fn=) cls_loss: tensor(0.0901, device='cuda:0', grad_fn=) cls_loss: tensor(0.1482, device='cuda:0', grad_fn=) cls_loss: tensor(0.0688, device='cuda:0', grad_fn=) cls_loss: tensor(0.1219, device='cuda:0', grad_fn=) cls_loss: tensor(0.1026, device='cuda:0', grad_fn=) cls_loss: tensor(0.1533, device='cuda:0', grad_fn=) cls_loss: tensor(0.1983, device='cuda:0', grad_fn=) cls_loss: tensor(0.3739, device='cuda:0', grad_fn=) cls_loss: tensor(0.4433, device='cuda:0', grad_fn=) cls_loss: tensor(0.2955, device='cuda:0', grad_fn=) cls_loss: tensor(0.1904, device='cuda:0', grad_fn=) cls_loss: tensor(0.2407, device='cuda:0', grad_fn=) cls_loss: tensor(0.1007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0485, device='cuda:0', grad_fn=) cls_loss: tensor(0.4687, device='cuda:0', grad_fn=) cls_loss: tensor(0.0519, device='cuda:0', grad_fn=) cls_loss: tensor(0.2987, device='cuda:0', grad_fn=) cls_loss: tensor(0.1293, device='cuda:0', grad_fn=) cls_loss: tensor(0.2186, device='cuda:0', grad_fn=) cls_loss: tensor(0.0794, device='cuda:0', grad_fn=) cls_loss: tensor(0.3035, device='cuda:0', grad_fn=) cls_loss: tensor(0.1838, device='cuda:0', grad_fn=) cls_loss: tensor(0.0458, device='cuda:0', grad_fn=) cls_loss: tensor(0.1496, device='cuda:0', grad_fn=) cls_loss: tensor(0.1490, device='cuda:0', grad_fn=) cls_loss: tensor(0.1388, device='cuda:0', grad_fn=) cls_loss: tensor(0.1267, device='cuda:0', grad_fn=) cls_loss: tensor(0.2654, device='cuda:0', grad_fn=) cls_loss: tensor(0.0998, device='cuda:0', grad_fn=) cls_loss: tensor(0.2536, device='cuda:0', grad_fn=) cls_loss: tensor(0.1278, device='cuda:0', grad_fn=) cls_loss: tensor(0.2191, device='cuda:0', grad_fn=) cls_loss: tensor(0.2527, device='cuda:0', grad_fn=) cls_loss: tensor(0.1393, device='cuda:0', grad_fn=) cls_loss: tensor(0.1610, device='cuda:0', grad_fn=) cls_loss: tensor(0.1093, device='cuda:0', grad_fn=) cls_loss: tensor(0.4826, device='cuda:0', grad_fn=) cls_loss: tensor(0.1251, device='cuda:0', grad_fn=) cls_loss: tensor(0.2966, device='cuda:0', grad_fn=) cls_loss: tensor(0.1281, device='cuda:0', grad_fn=) cls_loss: tensor(0.2387, device='cuda:0', grad_fn=) cls_loss: tensor(0.0887, device='cuda:0', grad_fn=) cls_loss: tensor(0.3301, device='cuda:0', grad_fn=) cls_loss: tensor(0.1823, device='cuda:0', grad_fn=) cls_loss: tensor(0.2160, device='cuda:0', grad_fn=) cls_loss: tensor(0.2347, device='cuda:0', grad_fn=) cls_loss: tensor(0.0583, device='cuda:0', grad_fn=) cls_loss: tensor(0.1190, device='cuda:0', grad_fn=) cls_loss: tensor(0.1779, device='cuda:0', grad_fn=) cls_loss: tensor(0.0811, device='cuda:0', grad_fn=) cls_loss: tensor(0.3870, device='cuda:0', grad_fn=) cls_loss: tensor(0.2327, device='cuda:0', grad_fn=) cls_loss: tensor(0.0353, device='cuda:0', grad_fn=) 0.0001 changing lr ---------------------saving model at epoch 3---------------------------------------------------- epoch 3, time 180.81, cls_loss 0.1847 100 cls_loss: tensor(0.1245, device='cuda:0', grad_fn=) cls_loss: tensor(0.2026, device='cuda:0', grad_fn=) cls_loss: tensor(0.1447, device='cuda:0', grad_fn=) cls_loss: tensor(0.0685, device='cuda:0', grad_fn=) cls_loss: tensor(0.0938, device='cuda:0', grad_fn=) cls_loss: tensor(0.1531, device='cuda:0', grad_fn=) cls_loss: tensor(0.1261, device='cuda:0', grad_fn=) cls_loss: tensor(0.1590, device='cuda:0', grad_fn=) cls_loss: tensor(0.1331, device='cuda:0', grad_fn=) cls_loss: tensor(0.1793, device='cuda:0', grad_fn=) cls_loss: tensor(0.2451, device='cuda:0', grad_fn=) cls_loss: tensor(0.0803, device='cuda:0', grad_fn=) cls_loss: tensor(0.1530, device='cuda:0', grad_fn=) cls_loss: tensor(0.0241, device='cuda:0', grad_fn=) cls_loss: tensor(0.1282, device='cuda:0', grad_fn=) cls_loss: tensor(0.0531, device='cuda:0', grad_fn=) cls_loss: tensor(0.0593, device='cuda:0', grad_fn=) cls_loss: tensor(0.1096, device='cuda:0', grad_fn=) cls_loss: tensor(0.1086, device='cuda:0', grad_fn=) cls_loss: tensor(0.1035, device='cuda:0', grad_fn=) cls_loss: tensor(0.0948, device='cuda:0', grad_fn=) cls_loss: tensor(0.0399, device='cuda:0', grad_fn=) cls_loss: tensor(0.0917, device='cuda:0', grad_fn=) cls_loss: tensor(0.0345, device='cuda:0', grad_fn=) cls_loss: tensor(0.0452, device='cuda:0', grad_fn=) cls_loss: tensor(0.0264, device='cuda:0', grad_fn=) cls_loss: tensor(0.0562, device='cuda:0', grad_fn=) cls_loss: tensor(0.1345, device='cuda:0', grad_fn=) cls_loss: tensor(0.5400, device='cuda:0', grad_fn=) cls_loss: tensor(0.0913, device='cuda:0', grad_fn=) cls_loss: tensor(0.2909, device='cuda:0', grad_fn=) cls_loss: tensor(0.0685, device='cuda:0', grad_fn=) cls_loss: tensor(0.2381, device='cuda:0', grad_fn=) cls_loss: tensor(0.1020, device='cuda:0', grad_fn=) cls_loss: tensor(0.1366, device='cuda:0', grad_fn=) cls_loss: tensor(0.1040, device='cuda:0', grad_fn=) cls_loss: tensor(0.3085, device='cuda:0', grad_fn=) cls_loss: tensor(0.3585, device='cuda:0', grad_fn=) cls_loss: tensor(0.0865, device='cuda:0', grad_fn=) cls_loss: tensor(0.1840, device='cuda:0', grad_fn=) cls_loss: tensor(0.1097, device='cuda:0', grad_fn=) cls_loss: tensor(0.6310, device='cuda:0', grad_fn=) cls_loss: tensor(0.3383, device='cuda:0', grad_fn=) cls_loss: tensor(0.0836, device='cuda:0', grad_fn=) cls_loss: tensor(0.0597, device='cuda:0', grad_fn=) cls_loss: tensor(0.0417, device='cuda:0', grad_fn=) cls_loss: tensor(0.0404, device='cuda:0', grad_fn=) cls_loss: tensor(0.1330, device='cuda:0', grad_fn=) cls_loss: tensor(0.3345, device='cuda:0', grad_fn=) cls_loss: tensor(0.2515, device='cuda:0', grad_fn=) cls_loss: tensor(0.3030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0683, device='cuda:0', grad_fn=) cls_loss: tensor(0.0389, device='cuda:0', grad_fn=) cls_loss: tensor(0.1089, device='cuda:0', grad_fn=) cls_loss: tensor(0.0857, device='cuda:0', grad_fn=) cls_loss: tensor(0.1412, device='cuda:0', grad_fn=) cls_loss: tensor(0.1729, device='cuda:0', grad_fn=) cls_loss: tensor(0.0860, device='cuda:0', grad_fn=) cls_loss: tensor(0.0456, device='cuda:0', grad_fn=) cls_loss: tensor(0.0738, device='cuda:0', grad_fn=) cls_loss: tensor(0.0472, device='cuda:0', grad_fn=) cls_loss: tensor(0.1172, device='cuda:0', grad_fn=) cls_loss: tensor(0.3650, device='cuda:0', grad_fn=) cls_loss: tensor(0.0551, device='cuda:0', grad_fn=) cls_loss: tensor(0.0368, device='cuda:0', grad_fn=) cls_loss: tensor(0.0974, device='cuda:0', grad_fn=) cls_loss: tensor(0.0792, device='cuda:0', grad_fn=) cls_loss: tensor(0.0614, device='cuda:0', grad_fn=) cls_loss: tensor(0.3708, device='cuda:0', grad_fn=) cls_loss: tensor(0.0267, device='cuda:0', grad_fn=) cls_loss: tensor(0.0280, device='cuda:0', grad_fn=) cls_loss: tensor(0.3569, device='cuda:0', grad_fn=) cls_loss: tensor(0.2385, device='cuda:0', grad_fn=) cls_loss: tensor(0.0375, device='cuda:0', grad_fn=) cls_loss: tensor(0.2219, device='cuda:0', grad_fn=) cls_loss: tensor(0.1149, device='cuda:0', grad_fn=) cls_loss: tensor(0.0878, device='cuda:0', grad_fn=) cls_loss: tensor(0.3464, device='cuda:0', grad_fn=) cls_loss: tensor(0.0921, device='cuda:0', grad_fn=) cls_loss: tensor(0.2182, device='cuda:0', grad_fn=) cls_loss: tensor(0.1553, device='cuda:0', grad_fn=) cls_loss: tensor(0.0386, device='cuda:0', grad_fn=) cls_loss: tensor(0.0666, device='cuda:0', grad_fn=) cls_loss: tensor(0.1333, device='cuda:0', grad_fn=) cls_loss: tensor(0.3014, device='cuda:0', grad_fn=) cls_loss: tensor(0.2113, device='cuda:0', grad_fn=) cls_loss: tensor(0.1387, device='cuda:0', grad_fn=) cls_loss: tensor(0.1124, device='cuda:0', grad_fn=) cls_loss: tensor(0.1646, device='cuda:0', grad_fn=) cls_loss: tensor(0.1073, device='cuda:0', grad_fn=) cls_loss: tensor(0.2640, device='cuda:0', grad_fn=) cls_loss: tensor(0.0167, device='cuda:0', grad_fn=) cls_loss: tensor(0.2150, device='cuda:0', grad_fn=) cls_loss: tensor(0.1058, device='cuda:0', grad_fn=) cls_loss: tensor(0.0485, device='cuda:0', grad_fn=) cls_loss: tensor(0.0817, device='cuda:0', grad_fn=) cls_loss: tensor(0.1662, device='cuda:0', grad_fn=) cls_loss: tensor(0.1507, device='cuda:0', grad_fn=) cls_loss: tensor(0.0683, device='cuda:0', grad_fn=) cls_loss: tensor(0.0415, device='cuda:0', grad_fn=) 0.0001 changing lr ---------------------saving model at epoch 4---------------------------------------------------- epoch 4, time 180.77, cls_loss 0.1422 100 cls_loss: tensor(0.0446, device='cuda:0', grad_fn=) cls_loss: tensor(0.0560, device='cuda:0', grad_fn=) cls_loss: tensor(0.1138, device='cuda:0', grad_fn=) cls_loss: tensor(0.0487, device='cuda:0', grad_fn=) cls_loss: tensor(0.0993, device='cuda:0', grad_fn=) cls_loss: tensor(0.2354, device='cuda:0', grad_fn=) cls_loss: tensor(0.1382, device='cuda:0', grad_fn=) cls_loss: tensor(0.0566, device='cuda:0', grad_fn=) cls_loss: tensor(0.0488, device='cuda:0', grad_fn=) cls_loss: tensor(0.0702, device='cuda:0', grad_fn=) cls_loss: tensor(0.0294, device='cuda:0', grad_fn=) cls_loss: tensor(0.0672, device='cuda:0', grad_fn=) cls_loss: tensor(0.2007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0829, device='cuda:0', grad_fn=) cls_loss: tensor(0.1332, device='cuda:0', grad_fn=) cls_loss: tensor(0.1258, device='cuda:0', grad_fn=) cls_loss: tensor(0.1304, device='cuda:0', grad_fn=) cls_loss: tensor(0.0410, device='cuda:0', grad_fn=) cls_loss: tensor(0.0878, device='cuda:0', grad_fn=) cls_loss: tensor(0.0770, device='cuda:0', grad_fn=) cls_loss: tensor(0.2492, device='cuda:0', grad_fn=) cls_loss: tensor(0.0195, device='cuda:0', grad_fn=) cls_loss: tensor(0.0271, device='cuda:0', grad_fn=) cls_loss: tensor(0.0582, device='cuda:0', grad_fn=) cls_loss: tensor(0.0208, device='cuda:0', grad_fn=) cls_loss: tensor(0.0608, device='cuda:0', grad_fn=) cls_loss: tensor(0.0283, device='cuda:0', grad_fn=) cls_loss: tensor(0.1606, device='cuda:0', grad_fn=) cls_loss: tensor(0.0485, device='cuda:0', grad_fn=) cls_loss: tensor(0.0745, device='cuda:0', grad_fn=) cls_loss: tensor(0.0798, device='cuda:0', grad_fn=) cls_loss: tensor(0.2605, device='cuda:0', grad_fn=) cls_loss: tensor(0.1547, device='cuda:0', grad_fn=) cls_loss: tensor(0.0431, device='cuda:0', grad_fn=) cls_loss: tensor(0.0178, device='cuda:0', grad_fn=) cls_loss: tensor(0.0215, device='cuda:0', grad_fn=) cls_loss: tensor(0.0256, device='cuda:0', grad_fn=) cls_loss: tensor(0.2287, device='cuda:0', grad_fn=) cls_loss: tensor(0.0985, device='cuda:0', grad_fn=) cls_loss: tensor(0.1166, device='cuda:0', grad_fn=) cls_loss: tensor(0.0440, device='cuda:0', grad_fn=) cls_loss: tensor(0.2130, device='cuda:0', grad_fn=) cls_loss: tensor(0.0158, device='cuda:0', grad_fn=) cls_loss: tensor(0.2162, device='cuda:0', grad_fn=) cls_loss: tensor(0.0935, device='cuda:0', grad_fn=) cls_loss: tensor(0.2006, device='cuda:0', grad_fn=) cls_loss: tensor(0.1556, device='cuda:0', grad_fn=) cls_loss: tensor(0.1104, device='cuda:0', grad_fn=) cls_loss: tensor(0.0839, device='cuda:0', grad_fn=) cls_loss: tensor(0.1185, device='cuda:0', grad_fn=) cls_loss: tensor(0.2769, device='cuda:0', grad_fn=) cls_loss: tensor(0.6162, device='cuda:0', grad_fn=) cls_loss: tensor(0.3589, device='cuda:0', grad_fn=) cls_loss: tensor(0.0420, device='cuda:0', grad_fn=) cls_loss: tensor(0.1019, device='cuda:0', grad_fn=) cls_loss: tensor(0.1402, device='cuda:0', grad_fn=) cls_loss: tensor(0.0989, device='cuda:0', grad_fn=) cls_loss: tensor(0.1065, device='cuda:0', grad_fn=) cls_loss: tensor(0.0353, device='cuda:0', grad_fn=) cls_loss: tensor(0.0188, device='cuda:0', grad_fn=) cls_loss: tensor(0.0153, device='cuda:0', grad_fn=) cls_loss: tensor(0.0234, device='cuda:0', grad_fn=) cls_loss: tensor(0.0470, device='cuda:0', grad_fn=) cls_loss: tensor(0.2104, device='cuda:0', grad_fn=) cls_loss: tensor(0.1707, device='cuda:0', grad_fn=) cls_loss: tensor(0.5426, device='cuda:0', grad_fn=) cls_loss: tensor(0.1869, device='cuda:0', grad_fn=) cls_loss: tensor(0.1060, device='cuda:0', grad_fn=) cls_loss: tensor(0.0360, device='cuda:0', grad_fn=) cls_loss: tensor(0.1494, device='cuda:0', grad_fn=) cls_loss: tensor(0.3134, device='cuda:0', grad_fn=) cls_loss: tensor(0.0835, device='cuda:0', grad_fn=) cls_loss: tensor(0.0240, device='cuda:0', grad_fn=) cls_loss: tensor(0.0302, device='cuda:0', grad_fn=) cls_loss: tensor(0.1334, device='cuda:0', grad_fn=) cls_loss: tensor(0.3250, device='cuda:0', grad_fn=) cls_loss: tensor(0.1328, device='cuda:0', grad_fn=) cls_loss: tensor(0.0459, device='cuda:0', grad_fn=) cls_loss: tensor(0.1028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0230, device='cuda:0', grad_fn=) cls_loss: tensor(0.1760, device='cuda:0', grad_fn=) cls_loss: tensor(0.0536, device='cuda:0', grad_fn=) cls_loss: tensor(0.2251, device='cuda:0', grad_fn=) cls_loss: tensor(0.2027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0367, device='cuda:0', grad_fn=) cls_loss: tensor(0.0680, device='cuda:0', grad_fn=) cls_loss: tensor(0.0756, device='cuda:0', grad_fn=) cls_loss: tensor(0.3796, device='cuda:0', grad_fn=) cls_loss: tensor(0.1119, device='cuda:0', grad_fn=) cls_loss: tensor(0.3358, device='cuda:0', grad_fn=) cls_loss: tensor(0.1883, device='cuda:0', grad_fn=) cls_loss: tensor(0.1508, device='cuda:0', grad_fn=) cls_loss: tensor(0.1343, device='cuda:0', grad_fn=) cls_loss: tensor(0.0263, device='cuda:0', grad_fn=) cls_loss: tensor(0.0318, device='cuda:0', grad_fn=) cls_loss: tensor(0.1210, device='cuda:0', grad_fn=) cls_loss: tensor(0.1324, device='cuda:0', grad_fn=) cls_loss: tensor(0.1162, device='cuda:0', grad_fn=) cls_loss: tensor(0.0530, device='cuda:0', grad_fn=) cls_loss: tensor(0.1589, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 5, time 180.23, cls_loss 0.1221 100 cls_loss: tensor(0.0415, device='cuda:0', grad_fn=) cls_loss: tensor(0.0891, device='cuda:0', grad_fn=) cls_loss: tensor(0.0729, device='cuda:0', grad_fn=) cls_loss: tensor(0.2428, device='cuda:0', grad_fn=) cls_loss: tensor(0.2143, device='cuda:0', grad_fn=) cls_loss: tensor(0.1014, device='cuda:0', grad_fn=) cls_loss: tensor(0.1865, device='cuda:0', grad_fn=) cls_loss: tensor(0.1188, device='cuda:0', grad_fn=) cls_loss: tensor(0.2153, device='cuda:0', grad_fn=) cls_loss: tensor(0.0838, device='cuda:0', grad_fn=) cls_loss: tensor(0.0554, device='cuda:0', grad_fn=) cls_loss: tensor(0.0690, device='cuda:0', grad_fn=) cls_loss: tensor(0.1487, device='cuda:0', grad_fn=) cls_loss: tensor(0.1032, device='cuda:0', grad_fn=) cls_loss: tensor(0.1031, device='cuda:0', grad_fn=) cls_loss: tensor(0.2396, device='cuda:0', grad_fn=) cls_loss: tensor(0.0971, device='cuda:0', grad_fn=) cls_loss: tensor(0.0210, device='cuda:0', grad_fn=) cls_loss: tensor(0.1206, device='cuda:0', grad_fn=) cls_loss: tensor(0.0623, device='cuda:0', grad_fn=) cls_loss: tensor(0.0993, device='cuda:0', grad_fn=) cls_loss: tensor(0.2473, device='cuda:0', grad_fn=) cls_loss: tensor(0.0887, device='cuda:0', grad_fn=) cls_loss: tensor(0.2896, device='cuda:0', grad_fn=) cls_loss: tensor(0.0832, device='cuda:0', grad_fn=) cls_loss: tensor(0.1908, device='cuda:0', grad_fn=) cls_loss: tensor(0.1991, device='cuda:0', grad_fn=) cls_loss: tensor(0.2234, device='cuda:0', grad_fn=) cls_loss: tensor(0.2046, device='cuda:0', grad_fn=) cls_loss: tensor(0.0799, device='cuda:0', grad_fn=) cls_loss: tensor(0.0151, device='cuda:0', grad_fn=) cls_loss: tensor(0.0233, device='cuda:0', grad_fn=) cls_loss: tensor(0.1482, device='cuda:0', grad_fn=) cls_loss: tensor(0.3636, device='cuda:0', grad_fn=) cls_loss: tensor(0.1507, device='cuda:0', grad_fn=) cls_loss: tensor(0.2699, device='cuda:0', grad_fn=) cls_loss: tensor(0.2519, device='cuda:0', grad_fn=) cls_loss: tensor(0.0401, device='cuda:0', grad_fn=) cls_loss: tensor(0.0839, device='cuda:0', grad_fn=) cls_loss: tensor(0.0477, device='cuda:0', grad_fn=) cls_loss: tensor(0.0599, device='cuda:0', grad_fn=) cls_loss: tensor(0.1181, device='cuda:0', grad_fn=) cls_loss: tensor(0.0628, device='cuda:0', grad_fn=) cls_loss: tensor(0.1529, device='cuda:0', grad_fn=) cls_loss: tensor(0.0349, device='cuda:0', grad_fn=) cls_loss: tensor(0.0624, device='cuda:0', grad_fn=) cls_loss: tensor(0.1106, device='cuda:0', grad_fn=) cls_loss: tensor(0.2574, device='cuda:0', grad_fn=) cls_loss: tensor(0.0876, device='cuda:0', grad_fn=) cls_loss: tensor(0.1470, device='cuda:0', grad_fn=) cls_loss: tensor(0.1812, device='cuda:0', grad_fn=) cls_loss: tensor(0.1501, device='cuda:0', grad_fn=) cls_loss: tensor(0.0693, device='cuda:0', grad_fn=) cls_loss: tensor(0.2863, device='cuda:0', grad_fn=) cls_loss: tensor(0.2412, device='cuda:0', grad_fn=) cls_loss: tensor(0.0983, device='cuda:0', grad_fn=) cls_loss: tensor(0.0332, device='cuda:0', grad_fn=) cls_loss: tensor(0.0586, device='cuda:0', grad_fn=) cls_loss: tensor(0.1304, device='cuda:0', grad_fn=) cls_loss: tensor(0.1566, device='cuda:0', grad_fn=) cls_loss: tensor(0.1382, device='cuda:0', grad_fn=) cls_loss: tensor(0.0790, device='cuda:0', grad_fn=) cls_loss: tensor(0.1459, device='cuda:0', grad_fn=) cls_loss: tensor(0.0250, device='cuda:0', grad_fn=) cls_loss: tensor(0.0899, device='cuda:0', grad_fn=) cls_loss: tensor(0.1574, device='cuda:0', grad_fn=) cls_loss: tensor(0.0275, device='cuda:0', grad_fn=) cls_loss: tensor(0.1998, device='cuda:0', grad_fn=) cls_loss: tensor(0.0848, device='cuda:0', grad_fn=) cls_loss: tensor(0.0122, device='cuda:0', grad_fn=) cls_loss: tensor(0.0359, device='cuda:0', grad_fn=) cls_loss: tensor(0.1057, device='cuda:0', grad_fn=) cls_loss: tensor(0.0168, device='cuda:0', grad_fn=) cls_loss: tensor(0.0194, device='cuda:0', grad_fn=) cls_loss: tensor(0.1687, device='cuda:0', grad_fn=) cls_loss: tensor(0.2155, device='cuda:0', grad_fn=) cls_loss: tensor(0.0493, device='cuda:0', grad_fn=) cls_loss: tensor(0.0552, device='cuda:0', grad_fn=) cls_loss: tensor(0.1198, device='cuda:0', grad_fn=) cls_loss: tensor(0.1511, device='cuda:0', grad_fn=) cls_loss: tensor(0.0954, device='cuda:0', grad_fn=) cls_loss: tensor(0.1683, device='cuda:0', grad_fn=) cls_loss: tensor(0.0469, device='cuda:0', grad_fn=) cls_loss: tensor(0.1681, device='cuda:0', grad_fn=) cls_loss: tensor(0.0292, device='cuda:0', grad_fn=) cls_loss: tensor(0.1018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(0.0314, device='cuda:0', grad_fn=) cls_loss: tensor(0.2951, device='cuda:0', grad_fn=) cls_loss: tensor(0.0192, device='cuda:0', grad_fn=) cls_loss: tensor(0.1123, device='cuda:0', grad_fn=) cls_loss: tensor(0.2788, device='cuda:0', grad_fn=) cls_loss: tensor(0.1871, device='cuda:0', grad_fn=) cls_loss: tensor(0.0776, device='cuda:0', grad_fn=) cls_loss: tensor(0.2128, device='cuda:0', grad_fn=) cls_loss: tensor(0.1093, device='cuda:0', grad_fn=) cls_loss: tensor(0.3048, device='cuda:0', grad_fn=) cls_loss: tensor(0.0431, device='cuda:0', grad_fn=) cls_loss: tensor(0.1557, device='cuda:0', grad_fn=) cls_loss: tensor(0.2597, device='cuda:0', grad_fn=) 0.0001 changing lr ---------------------saving model at epoch 6---------------------------------------------------- epoch 6, time 179.72, cls_loss 0.1258 100 cls_loss: tensor(0.0251, device='cuda:0', grad_fn=) cls_loss: tensor(0.3583, device='cuda:0', grad_fn=) cls_loss: tensor(0.0905, device='cuda:0', grad_fn=) cls_loss: tensor(0.2031, device='cuda:0', grad_fn=) cls_loss: tensor(0.0424, device='cuda:0', grad_fn=) cls_loss: tensor(0.1086, device='cuda:0', grad_fn=) cls_loss: tensor(0.1000, device='cuda:0', grad_fn=) cls_loss: tensor(0.2734, device='cuda:0', grad_fn=) cls_loss: tensor(0.0417, device='cuda:0', grad_fn=) cls_loss: tensor(0.0149, device='cuda:0', grad_fn=) cls_loss: tensor(0.0411, device='cuda:0', grad_fn=) cls_loss: tensor(0.2023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0859, device='cuda:0', grad_fn=) cls_loss: tensor(0.1139, device='cuda:0', grad_fn=) cls_loss: tensor(0.0642, device='cuda:0', grad_fn=) cls_loss: tensor(0.0186, device='cuda:0', grad_fn=) cls_loss: tensor(0.1033, device='cuda:0', grad_fn=) cls_loss: tensor(0.0477, device='cuda:0', grad_fn=) cls_loss: tensor(0.0994, device='cuda:0', grad_fn=) cls_loss: tensor(0.0504, device='cuda:0', grad_fn=) cls_loss: tensor(0.1158, device='cuda:0', grad_fn=) cls_loss: tensor(0.0306, device='cuda:0', grad_fn=) cls_loss: tensor(0.0259, device='cuda:0', grad_fn=) cls_loss: tensor(0.0921, device='cuda:0', grad_fn=) cls_loss: tensor(0.0881, device='cuda:0', grad_fn=) cls_loss: tensor(0.0185, device='cuda:0', grad_fn=) cls_loss: tensor(0.3668, device='cuda:0', grad_fn=) cls_loss: tensor(0.0592, device='cuda:0', grad_fn=) cls_loss: tensor(0.0980, device='cuda:0', grad_fn=) cls_loss: tensor(0.0079, device='cuda:0', grad_fn=) cls_loss: tensor(0.1378, device='cuda:0', grad_fn=) cls_loss: tensor(0.0761, device='cuda:0', grad_fn=) cls_loss: tensor(0.0185, device='cuda:0', grad_fn=) cls_loss: tensor(0.0142, device='cuda:0', grad_fn=) cls_loss: tensor(0.0113, device='cuda:0', grad_fn=) cls_loss: tensor(0.0565, device='cuda:0', grad_fn=) cls_loss: tensor(0.1267, device='cuda:0', grad_fn=) cls_loss: tensor(0.2118, device='cuda:0', grad_fn=) cls_loss: tensor(0.1089, device='cuda:0', grad_fn=) cls_loss: tensor(0.1214, device='cuda:0', grad_fn=) cls_loss: tensor(0.0210, device='cuda:0', grad_fn=) cls_loss: tensor(0.0089, device='cuda:0', grad_fn=) cls_loss: tensor(0.0990, device='cuda:0', grad_fn=) cls_loss: tensor(0.0698, device='cuda:0', grad_fn=) cls_loss: tensor(0.0354, device='cuda:0', grad_fn=) cls_loss: tensor(0.0177, device='cuda:0', grad_fn=) cls_loss: tensor(0.0898, device='cuda:0', grad_fn=) cls_loss: tensor(0.1833, device='cuda:0', grad_fn=) cls_loss: tensor(0.0291, device='cuda:0', grad_fn=) cls_loss: tensor(0.0802, device='cuda:0', grad_fn=) cls_loss: tensor(0.1628, device='cuda:0', grad_fn=) cls_loss: tensor(0.0117, device='cuda:0', grad_fn=) cls_loss: tensor(0.0396, device='cuda:0', grad_fn=) cls_loss: tensor(0.0531, device='cuda:0', grad_fn=) cls_loss: tensor(0.0466, device='cuda:0', grad_fn=) cls_loss: tensor(0.0326, device='cuda:0', grad_fn=) cls_loss: tensor(0.0320, device='cuda:0', grad_fn=) cls_loss: tensor(0.2706, device='cuda:0', grad_fn=) cls_loss: tensor(0.2776, device='cuda:0', grad_fn=) cls_loss: tensor(0.0077, device='cuda:0', grad_fn=) cls_loss: tensor(0.0437, device='cuda:0', grad_fn=) cls_loss: tensor(0.1720, device='cuda:0', grad_fn=) cls_loss: tensor(0.0184, device='cuda:0', grad_fn=) cls_loss: tensor(0.0307, device='cuda:0', grad_fn=) cls_loss: tensor(0.2881, device='cuda:0', grad_fn=) cls_loss: tensor(0.0136, device='cuda:0', grad_fn=) cls_loss: tensor(0.0879, device='cuda:0', grad_fn=) cls_loss: tensor(0.0165, device='cuda:0', grad_fn=) cls_loss: tensor(0.1080, device='cuda:0', grad_fn=) cls_loss: tensor(0.0297, device='cuda:0', grad_fn=) cls_loss: tensor(0.1115, device='cuda:0', grad_fn=) cls_loss: tensor(0.0424, device='cuda:0', grad_fn=) cls_loss: tensor(0.0654, device='cuda:0', grad_fn=) cls_loss: tensor(0.0739, device='cuda:0', grad_fn=) cls_loss: tensor(0.0461, device='cuda:0', grad_fn=) cls_loss: tensor(0.0624, device='cuda:0', grad_fn=) cls_loss: tensor(0.4801, device='cuda:0', grad_fn=) cls_loss: tensor(0.0354, device='cuda:0', grad_fn=) cls_loss: tensor(0.0297, device='cuda:0', grad_fn=) cls_loss: tensor(0.0340, device='cuda:0', grad_fn=) cls_loss: tensor(0.0124, device='cuda:0', grad_fn=) cls_loss: tensor(0.0403, device='cuda:0', grad_fn=) cls_loss: tensor(0.1887, device='cuda:0', grad_fn=) cls_loss: tensor(0.1058, device='cuda:0', grad_fn=) cls_loss: tensor(0.1034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0400, device='cuda:0', grad_fn=) cls_loss: tensor(0.0316, device='cuda:0', grad_fn=) cls_loss: tensor(0.1178, device='cuda:0', grad_fn=) cls_loss: tensor(0.0712, device='cuda:0', grad_fn=) cls_loss: tensor(0.0084, device='cuda:0', grad_fn=) cls_loss: tensor(0.0604, device='cuda:0', grad_fn=) cls_loss: tensor(0.0183, device='cuda:0', grad_fn=) cls_loss: tensor(0.0629, device='cuda:0', grad_fn=) cls_loss: tensor(0.0292, device='cuda:0', grad_fn=) cls_loss: tensor(0.0336, device='cuda:0', grad_fn=) cls_loss: tensor(0.0504, device='cuda:0', grad_fn=) cls_loss: tensor(0.0344, device='cuda:0', grad_fn=) cls_loss: tensor(0.1241, device='cuda:0', grad_fn=) cls_loss: tensor(0.0276, device='cuda:0', grad_fn=) cls_loss: tensor(0.0525, device='cuda:0', grad_fn=) 0.0001 changing lr ---------------------saving model at epoch 7---------------------------------------------------- epoch 7, time 178.45, cls_loss 0.0844 100 cls_loss: tensor(0.0222, device='cuda:0', grad_fn=) cls_loss: tensor(0.1222, device='cuda:0', grad_fn=) cls_loss: tensor(0.0093, device='cuda:0', grad_fn=) cls_loss: tensor(0.0587, device='cuda:0', grad_fn=) cls_loss: tensor(0.0145, device='cuda:0', grad_fn=) cls_loss: tensor(0.0908, device='cuda:0', grad_fn=) cls_loss: tensor(0.0844, device='cuda:0', grad_fn=) cls_loss: tensor(0.0428, device='cuda:0', grad_fn=) cls_loss: tensor(0.0088, device='cuda:0', grad_fn=) cls_loss: tensor(0.0125, device='cuda:0', grad_fn=) cls_loss: tensor(0.1514, device='cuda:0', grad_fn=) cls_loss: tensor(0.0187, device='cuda:0', grad_fn=) cls_loss: tensor(0.0368, device='cuda:0', grad_fn=) cls_loss: tensor(0.1189, device='cuda:0', grad_fn=) cls_loss: tensor(0.0158, device='cuda:0', grad_fn=) cls_loss: tensor(0.0202, device='cuda:0', grad_fn=) cls_loss: tensor(0.0361, device='cuda:0', grad_fn=) cls_loss: tensor(0.0062, device='cuda:0', grad_fn=) cls_loss: tensor(0.0280, device='cuda:0', grad_fn=) cls_loss: tensor(0.0407, device='cuda:0', grad_fn=) cls_loss: tensor(0.0534, device='cuda:0', grad_fn=) cls_loss: tensor(0.0624, device='cuda:0', grad_fn=) cls_loss: tensor(0.0689, device='cuda:0', grad_fn=) cls_loss: tensor(0.1567, device='cuda:0', grad_fn=) cls_loss: tensor(0.0366, device='cuda:0', grad_fn=) cls_loss: tensor(0.0444, device='cuda:0', grad_fn=) cls_loss: tensor(0.0095, device='cuda:0', grad_fn=) cls_loss: tensor(0.0150, device='cuda:0', grad_fn=) cls_loss: tensor(0.1986, device='cuda:0', grad_fn=) cls_loss: tensor(0.1186, device='cuda:0', grad_fn=) cls_loss: tensor(0.0206, device='cuda:0', grad_fn=) cls_loss: tensor(0.0626, device='cuda:0', grad_fn=) cls_loss: tensor(0.0231, device='cuda:0', grad_fn=) cls_loss: tensor(0.1358, device='cuda:0', grad_fn=) cls_loss: tensor(0.0247, device='cuda:0', grad_fn=) cls_loss: tensor(0.0240, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0601, device='cuda:0', grad_fn=) cls_loss: tensor(0.0183, device='cuda:0', grad_fn=) cls_loss: tensor(0.0963, device='cuda:0', grad_fn=) cls_loss: tensor(0.0120, device='cuda:0', grad_fn=) cls_loss: tensor(0.1810, device='cuda:0', grad_fn=) cls_loss: tensor(0.0310, device='cuda:0', grad_fn=) cls_loss: tensor(0.1097, device='cuda:0', grad_fn=) cls_loss: tensor(0.2772, device='cuda:0', grad_fn=) cls_loss: tensor(0.0096, device='cuda:0', grad_fn=) cls_loss: tensor(0.1783, device='cuda:0', grad_fn=) cls_loss: tensor(0.5294, device='cuda:0', grad_fn=) cls_loss: tensor(0.1214, device='cuda:0', grad_fn=) cls_loss: tensor(0.0833, device='cuda:0', grad_fn=) cls_loss: tensor(0.0795, device='cuda:0', grad_fn=) cls_loss: tensor(0.1116, device='cuda:0', grad_fn=) cls_loss: tensor(0.0786, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(0.1099, device='cuda:0', grad_fn=) cls_loss: tensor(0.3203, device='cuda:0', grad_fn=) cls_loss: tensor(0.0519, device='cuda:0', grad_fn=) cls_loss: tensor(0.5539, device='cuda:0', grad_fn=) cls_loss: tensor(0.1927, device='cuda:0', grad_fn=) cls_loss: tensor(0.1404, device='cuda:0', grad_fn=) cls_loss: tensor(0.1236, device='cuda:0', grad_fn=) cls_loss: tensor(0.0673, device='cuda:0', grad_fn=) cls_loss: tensor(0.0191, device='cuda:0', grad_fn=) cls_loss: tensor(0.0451, device='cuda:0', grad_fn=) cls_loss: tensor(0.0230, device='cuda:0', grad_fn=) cls_loss: tensor(0.1482, device='cuda:0', grad_fn=) cls_loss: tensor(0.0221, device='cuda:0', grad_fn=) cls_loss: tensor(0.0062, device='cuda:0', grad_fn=) cls_loss: tensor(0.2527, device='cuda:0', grad_fn=) cls_loss: tensor(0.0160, device='cuda:0', grad_fn=) cls_loss: tensor(0.0576, device='cuda:0', grad_fn=) cls_loss: tensor(0.0804, device='cuda:0', grad_fn=) cls_loss: tensor(0.0173, device='cuda:0', grad_fn=) cls_loss: tensor(0.0460, device='cuda:0', grad_fn=) cls_loss: tensor(0.1177, device='cuda:0', grad_fn=) cls_loss: tensor(0.0221, device='cuda:0', grad_fn=) cls_loss: tensor(0.1030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0206, device='cuda:0', grad_fn=) cls_loss: tensor(0.0952, device='cuda:0', grad_fn=) cls_loss: tensor(0.0178, device='cuda:0', grad_fn=) cls_loss: tensor(0.1099, device='cuda:0', grad_fn=) cls_loss: tensor(0.0147, device='cuda:0', grad_fn=) cls_loss: tensor(0.0472, device='cuda:0', grad_fn=) cls_loss: tensor(0.0219, device='cuda:0', grad_fn=) cls_loss: tensor(0.0885, device='cuda:0', grad_fn=) cls_loss: tensor(0.0248, device='cuda:0', grad_fn=) cls_loss: tensor(0.0050, device='cuda:0', grad_fn=) cls_loss: tensor(0.0145, device='cuda:0', grad_fn=) cls_loss: tensor(0.0442, device='cuda:0', grad_fn=) cls_loss: tensor(0.0279, device='cuda:0', grad_fn=) cls_loss: tensor(0.0561, device='cuda:0', grad_fn=) cls_loss: tensor(0.2315, device='cuda:0', grad_fn=) cls_loss: tensor(0.0279, device='cuda:0', grad_fn=) cls_loss: tensor(0.0152, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(0.0361, device='cuda:0', grad_fn=) cls_loss: tensor(0.0070, device='cuda:0', grad_fn=) cls_loss: tensor(0.0093, device='cuda:0', grad_fn=) cls_loss: tensor(0.0095, device='cuda:0', grad_fn=) cls_loss: tensor(0.0260, device='cuda:0', grad_fn=) 0.0001 changing lr ---------------------saving model at epoch 8---------------------------------------------------- epoch 8, time 178.44, cls_loss 0.0747 100 cls_loss: tensor(0.2122, device='cuda:0', grad_fn=) cls_loss: tensor(0.0633, device='cuda:0', grad_fn=) cls_loss: tensor(0.0692, device='cuda:0', grad_fn=) cls_loss: tensor(0.0144, device='cuda:0', grad_fn=) cls_loss: tensor(0.0678, device='cuda:0', grad_fn=) cls_loss: tensor(0.0137, device='cuda:0', grad_fn=) cls_loss: tensor(0.0110, device='cuda:0', grad_fn=) cls_loss: tensor(0.0384, device='cuda:0', grad_fn=) cls_loss: tensor(0.1975, device='cuda:0', grad_fn=) cls_loss: tensor(0.0762, device='cuda:0', grad_fn=) cls_loss: tensor(0.0740, device='cuda:0', grad_fn=) cls_loss: tensor(0.1288, device='cuda:0', grad_fn=) cls_loss: tensor(0.1041, device='cuda:0', grad_fn=) cls_loss: tensor(0.0324, device='cuda:0', grad_fn=) cls_loss: tensor(0.2125, device='cuda:0', grad_fn=) cls_loss: tensor(0.1114, device='cuda:0', grad_fn=) cls_loss: tensor(0.0101, device='cuda:0', grad_fn=) cls_loss: tensor(0.0710, device='cuda:0', grad_fn=) cls_loss: tensor(0.0850, device='cuda:0', grad_fn=) cls_loss: tensor(0.0124, device='cuda:0', grad_fn=) cls_loss: tensor(0.0573, device='cuda:0', grad_fn=) cls_loss: tensor(0.1163, device='cuda:0', grad_fn=) cls_loss: tensor(0.0256, device='cuda:0', grad_fn=) cls_loss: tensor(0.1087, device='cuda:0', grad_fn=) cls_loss: tensor(0.0209, device='cuda:0', grad_fn=) cls_loss: tensor(0.1678, device='cuda:0', grad_fn=) cls_loss: tensor(0.0777, device='cuda:0', grad_fn=) cls_loss: tensor(0.0151, device='cuda:0', grad_fn=) cls_loss: tensor(0.0125, device='cuda:0', grad_fn=) cls_loss: tensor(0.0282, device='cuda:0', grad_fn=) cls_loss: tensor(0.0384, device='cuda:0', grad_fn=) cls_loss: tensor(0.0044, device='cuda:0', grad_fn=) cls_loss: tensor(0.0599, device='cuda:0', grad_fn=) cls_loss: tensor(0.0259, device='cuda:0', grad_fn=) cls_loss: tensor(0.0255, device='cuda:0', grad_fn=) cls_loss: tensor(0.0357, device='cuda:0', grad_fn=) cls_loss: tensor(0.0061, device='cuda:0', grad_fn=) cls_loss: tensor(0.2077, device='cuda:0', grad_fn=) cls_loss: tensor(0.1506, device='cuda:0', grad_fn=) cls_loss: tensor(0.0246, device='cuda:0', grad_fn=) cls_loss: tensor(0.2023, device='cuda:0', grad_fn=) cls_loss: tensor(0.2166, device='cuda:0', grad_fn=) cls_loss: tensor(0.0248, device='cuda:0', grad_fn=) cls_loss: tensor(0.0389, device='cuda:0', grad_fn=) cls_loss: tensor(0.1267, device='cuda:0', grad_fn=) cls_loss: tensor(0.0473, device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(0.1382, device='cuda:0', grad_fn=) cls_loss: tensor(0.0123, device='cuda:0', grad_fn=) cls_loss: tensor(0.1094, device='cuda:0', grad_fn=) cls_loss: tensor(0.0064, device='cuda:0', grad_fn=) cls_loss: tensor(0.0620, device='cuda:0', grad_fn=) cls_loss: tensor(0.1052, device='cuda:0', grad_fn=) cls_loss: tensor(0.0074, device='cuda:0', grad_fn=) cls_loss: tensor(0.0413, device='cuda:0', grad_fn=) cls_loss: tensor(0.1606, device='cuda:0', grad_fn=) cls_loss: tensor(0.0824, device='cuda:0', grad_fn=) cls_loss: tensor(0.0896, device='cuda:0', grad_fn=) cls_loss: tensor(0.0195, device='cuda:0', grad_fn=) cls_loss: tensor(0.0811, device='cuda:0', grad_fn=) cls_loss: tensor(0.0117, device='cuda:0', grad_fn=) cls_loss: tensor(0.0239, device='cuda:0', grad_fn=) cls_loss: tensor(0.0523, device='cuda:0', grad_fn=) cls_loss: tensor(0.0396, device='cuda:0', grad_fn=) cls_loss: tensor(0.0289, device='cuda:0', grad_fn=) cls_loss: tensor(0.1115, device='cuda:0', grad_fn=) cls_loss: tensor(0.2149, device='cuda:0', grad_fn=) cls_loss: tensor(0.0721, device='cuda:0', grad_fn=) cls_loss: tensor(0.0997, device='cuda:0', grad_fn=) cls_loss: tensor(0.0491, device='cuda:0', grad_fn=) cls_loss: tensor(0.0111, device='cuda:0', grad_fn=) cls_loss: tensor(0.0133, device='cuda:0', grad_fn=) cls_loss: tensor(0.0538, device='cuda:0', grad_fn=) cls_loss: tensor(0.0075, device='cuda:0', grad_fn=) cls_loss: tensor(0.0500, device='cuda:0', grad_fn=) cls_loss: tensor(0.1539, device='cuda:0', grad_fn=) cls_loss: tensor(0.0859, device='cuda:0', grad_fn=) cls_loss: tensor(0.0064, device='cuda:0', grad_fn=) cls_loss: tensor(0.0134, device='cuda:0', grad_fn=) cls_loss: tensor(0.1171, device='cuda:0', grad_fn=) cls_loss: tensor(0.0111, device='cuda:0', grad_fn=) cls_loss: tensor(0.0294, device='cuda:0', grad_fn=) cls_loss: tensor(0.0128, device='cuda:0', grad_fn=) cls_loss: tensor(0.0303, device='cuda:0', grad_fn=) cls_loss: tensor(0.0382, device='cuda:0', grad_fn=) cls_loss: tensor(0.0405, device='cuda:0', grad_fn=) cls_loss: tensor(0.0543, device='cuda:0', grad_fn=) cls_loss: tensor(0.0085, device='cuda:0', grad_fn=) cls_loss: tensor(0.0467, device='cuda:0', grad_fn=) cls_loss: tensor(0.1006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0469, device='cuda:0', grad_fn=) cls_loss: tensor(0.0172, device='cuda:0', grad_fn=) cls_loss: tensor(0.0337, device='cuda:0', grad_fn=) cls_loss: tensor(0.0542, device='cuda:0', grad_fn=) cls_loss: tensor(0.1017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0218, device='cuda:0', grad_fn=) cls_loss: tensor(0.0088, device='cuda:0', grad_fn=) cls_loss: tensor(0.0072, device='cuda:0', grad_fn=) cls_loss: tensor(0.0219, device='cuda:0', grad_fn=) cls_loss: tensor(0.2915, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 9, time 178.73, cls_loss 0.0658 100 cls_loss: tensor(0.0098, device='cuda:0', grad_fn=) cls_loss: tensor(0.1471, device='cuda:0', grad_fn=) cls_loss: tensor(0.1243, device='cuda:0', grad_fn=) cls_loss: tensor(0.0237, device='cuda:0', grad_fn=) cls_loss: tensor(0.0267, device='cuda:0', grad_fn=) cls_loss: tensor(0.0973, device='cuda:0', grad_fn=) cls_loss: tensor(0.0510, device='cuda:0', grad_fn=) cls_loss: tensor(0.0372, device='cuda:0', grad_fn=) cls_loss: tensor(0.0265, device='cuda:0', grad_fn=) cls_loss: tensor(0.0764, device='cuda:0', grad_fn=) cls_loss: tensor(0.1684, device='cuda:0', grad_fn=) cls_loss: tensor(0.0136, device='cuda:0', grad_fn=) cls_loss: tensor(0.0095, device='cuda:0', grad_fn=) cls_loss: tensor(0.0412, device='cuda:0', grad_fn=) cls_loss: tensor(0.0109, device='cuda:0', grad_fn=) cls_loss: tensor(0.0178, device='cuda:0', grad_fn=) cls_loss: tensor(0.0681, device='cuda:0', grad_fn=) cls_loss: tensor(0.0113, device='cuda:0', grad_fn=) cls_loss: tensor(0.0270, device='cuda:0', grad_fn=) cls_loss: tensor(0.0188, device='cuda:0', grad_fn=) cls_loss: tensor(0.0220, device='cuda:0', grad_fn=) cls_loss: tensor(0.0160, device='cuda:0', grad_fn=) cls_loss: tensor(0.0388, device='cuda:0', grad_fn=) cls_loss: tensor(0.1616, device='cuda:0', grad_fn=) cls_loss: tensor(0.1322, device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(0.0111, device='cuda:0', grad_fn=) cls_loss: tensor(0.0651, device='cuda:0', grad_fn=) cls_loss: tensor(0.0144, device='cuda:0', grad_fn=) cls_loss: tensor(0.3520, device='cuda:0', grad_fn=) cls_loss: tensor(0.0488, device='cuda:0', grad_fn=) cls_loss: tensor(0.0055, device='cuda:0', grad_fn=) cls_loss: tensor(0.0137, device='cuda:0', grad_fn=) cls_loss: tensor(0.0383, device='cuda:0', grad_fn=) cls_loss: tensor(0.0322, device='cuda:0', grad_fn=) cls_loss: tensor(0.0213, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(0.1807, device='cuda:0', grad_fn=) cls_loss: tensor(0.1116, device='cuda:0', grad_fn=) cls_loss: tensor(0.0247, device='cuda:0', grad_fn=) cls_loss: tensor(0.1767, device='cuda:0', grad_fn=) cls_loss: tensor(0.1455, device='cuda:0', grad_fn=) cls_loss: tensor(0.0106, device='cuda:0', grad_fn=) cls_loss: tensor(0.0675, device='cuda:0', grad_fn=) cls_loss: tensor(0.0898, device='cuda:0', grad_fn=) cls_loss: tensor(0.0292, device='cuda:0', grad_fn=) cls_loss: tensor(0.0243, device='cuda:0', grad_fn=) cls_loss: tensor(0.0364, device='cuda:0', grad_fn=) cls_loss: tensor(0.0110, device='cuda:0', grad_fn=) cls_loss: tensor(0.1898, device='cuda:0', grad_fn=) cls_loss: tensor(0.1661, device='cuda:0', grad_fn=) cls_loss: tensor(0.0209, device='cuda:0', grad_fn=) cls_loss: tensor(0.0078, device='cuda:0', grad_fn=) cls_loss: tensor(0.0612, device='cuda:0', grad_fn=) cls_loss: tensor(0.0107, device='cuda:0', grad_fn=) cls_loss: tensor(0.0190, device='cuda:0', grad_fn=) cls_loss: tensor(0.0731, device='cuda:0', grad_fn=) cls_loss: tensor(0.2544, device='cuda:0', grad_fn=) cls_loss: tensor(0.0283, device='cuda:0', grad_fn=) cls_loss: tensor(0.0163, device='cuda:0', grad_fn=) cls_loss: tensor(0.2327, device='cuda:0', grad_fn=) cls_loss: tensor(0.0620, device='cuda:0', grad_fn=) cls_loss: tensor(0.0304, device='cuda:0', grad_fn=) cls_loss: tensor(0.0247, device='cuda:0', grad_fn=) cls_loss: tensor(0.0188, device='cuda:0', grad_fn=) cls_loss: tensor(0.0947, device='cuda:0', grad_fn=) cls_loss: tensor(0.1053, device='cuda:0', grad_fn=) cls_loss: tensor(0.0321, device='cuda:0', grad_fn=) cls_loss: tensor(0.0697, device='cuda:0', grad_fn=) cls_loss: tensor(0.2070, device='cuda:0', grad_fn=) cls_loss: tensor(0.0293, device='cuda:0', grad_fn=) cls_loss: tensor(0.0142, device='cuda:0', grad_fn=) cls_loss: tensor(0.0551, device='cuda:0', grad_fn=) cls_loss: tensor(0.0331, device='cuda:0', grad_fn=) cls_loss: tensor(0.0418, device='cuda:0', grad_fn=) cls_loss: tensor(0.1081, device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(0.0281, device='cuda:0', grad_fn=) cls_loss: tensor(0.1936, device='cuda:0', grad_fn=) cls_loss: tensor(0.0282, device='cuda:0', grad_fn=) cls_loss: tensor(0.0317, device='cuda:0', grad_fn=) cls_loss: tensor(0.1039, device='cuda:0', grad_fn=) cls_loss: tensor(0.0119, device='cuda:0', grad_fn=) cls_loss: tensor(0.0260, device='cuda:0', grad_fn=) cls_loss: tensor(0.0670, device='cuda:0', grad_fn=) cls_loss: tensor(0.0362, device='cuda:0', grad_fn=) cls_loss: tensor(0.1240, device='cuda:0', grad_fn=) cls_loss: tensor(0.0141, device='cuda:0', grad_fn=) cls_loss: tensor(0.0385, device='cuda:0', grad_fn=) cls_loss: tensor(0.1084, device='cuda:0', grad_fn=) cls_loss: tensor(0.0148, device='cuda:0', grad_fn=) cls_loss: tensor(0.0095, device='cuda:0', grad_fn=) cls_loss: tensor(0.0331, device='cuda:0', grad_fn=) cls_loss: tensor(0.0132, device='cuda:0', grad_fn=) cls_loss: tensor(0.0594, device='cuda:0', grad_fn=) cls_loss: tensor(0.1436, device='cuda:0', grad_fn=) cls_loss: tensor(0.0065, device='cuda:0', grad_fn=) cls_loss: tensor(0.0078, device='cuda:0', grad_fn=) cls_loss: tensor(0.2029, device='cuda:0', grad_fn=) cls_loss: tensor(0.0847, device='cuda:0', grad_fn=) 0.0001 changing lr ---------------------saving model at epoch 10---------------------------------------------------- epoch 10, time 178.65, cls_loss 0.0630 100 cls_loss: tensor(0.1233, device='cuda:0', grad_fn=) cls_loss: tensor(0.1535, device='cuda:0', grad_fn=) cls_loss: tensor(0.0064, device='cuda:0', grad_fn=) cls_loss: tensor(0.0412, device='cuda:0', grad_fn=) cls_loss: tensor(0.0566, device='cuda:0', grad_fn=) cls_loss: tensor(0.0141, device='cuda:0', grad_fn=) cls_loss: tensor(0.0418, device='cuda:0', grad_fn=) cls_loss: tensor(0.0603, device='cuda:0', grad_fn=) cls_loss: tensor(0.1615, device='cuda:0', grad_fn=) cls_loss: tensor(0.0185, device='cuda:0', grad_fn=) cls_loss: tensor(0.0699, device='cuda:0', grad_fn=) cls_loss: tensor(0.0913, device='cuda:0', grad_fn=) cls_loss: tensor(0.1582, device='cuda:0', grad_fn=) cls_loss: tensor(0.1057, device='cuda:0', grad_fn=) cls_loss: tensor(0.0262, device='cuda:0', grad_fn=) cls_loss: tensor(0.0331, device='cuda:0', grad_fn=) cls_loss: tensor(0.0265, device='cuda:0', grad_fn=) cls_loss: tensor(0.3086, device='cuda:0', grad_fn=) cls_loss: tensor(0.0120, device='cuda:0', grad_fn=) cls_loss: tensor(0.2157, device='cuda:0', grad_fn=) cls_loss: tensor(0.1471, device='cuda:0', grad_fn=) cls_loss: tensor(0.0305, device='cuda:0', grad_fn=) cls_loss: tensor(0.1217, device='cuda:0', grad_fn=) cls_loss: tensor(0.1538, device='cuda:0', grad_fn=) cls_loss: tensor(0.0236, device='cuda:0', grad_fn=) cls_loss: tensor(0.0600, device='cuda:0', grad_fn=) cls_loss: tensor(0.0137, device='cuda:0', grad_fn=) cls_loss: tensor(0.0173, device='cuda:0', grad_fn=) cls_loss: tensor(0.2352, device='cuda:0', grad_fn=) cls_loss: tensor(0.2526, device='cuda:0', grad_fn=) cls_loss: tensor(0.0833, device='cuda:0', grad_fn=) cls_loss: tensor(0.0446, device='cuda:0', grad_fn=) cls_loss: tensor(0.1051, device='cuda:0', grad_fn=) cls_loss: tensor(0.0229, device='cuda:0', grad_fn=) cls_loss: tensor(0.0128, device='cuda:0', grad_fn=) cls_loss: tensor(0.0325, device='cuda:0', grad_fn=) cls_loss: tensor(0.2524, device='cuda:0', grad_fn=) cls_loss: tensor(0.1137, device='cuda:0', grad_fn=) cls_loss: tensor(0.0263, device='cuda:0', grad_fn=) cls_loss: tensor(0.1749, device='cuda:0', grad_fn=) cls_loss: tensor(0.0233, device='cuda:0', grad_fn=) cls_loss: tensor(0.0286, device='cuda:0', grad_fn=) cls_loss: tensor(0.0164, device='cuda:0', grad_fn=) cls_loss: tensor(0.0155, device='cuda:0', grad_fn=) cls_loss: tensor(0.0196, device='cuda:0', grad_fn=) cls_loss: tensor(0.0431, device='cuda:0', grad_fn=) cls_loss: tensor(0.0344, device='cuda:0', grad_fn=) cls_loss: tensor(0.0102, device='cuda:0', grad_fn=) cls_loss: tensor(0.2531, device='cuda:0', grad_fn=) cls_loss: tensor(0.1044, device='cuda:0', grad_fn=) cls_loss: tensor(0.1554, device='cuda:0', grad_fn=) cls_loss: tensor(0.0805, device='cuda:0', grad_fn=) cls_loss: tensor(0.0097, device='cuda:0', grad_fn=) cls_loss: tensor(0.0532, device='cuda:0', grad_fn=) cls_loss: tensor(0.2432, device='cuda:0', grad_fn=) cls_loss: tensor(0.0120, device='cuda:0', grad_fn=) cls_loss: tensor(0.1455, device='cuda:0', grad_fn=) cls_loss: tensor(0.0308, device='cuda:0', grad_fn=) cls_loss: tensor(0.0351, device='cuda:0', grad_fn=) cls_loss: tensor(0.0163, device='cuda:0', grad_fn=) cls_loss: tensor(0.0185, device='cuda:0', grad_fn=) cls_loss: tensor(0.0157, device='cuda:0', grad_fn=) cls_loss: tensor(0.0978, device='cuda:0', grad_fn=) cls_loss: tensor(0.0657, device='cuda:0', grad_fn=) cls_loss: tensor(0.0389, device='cuda:0', grad_fn=) cls_loss: tensor(0.0216, device='cuda:0', grad_fn=) cls_loss: tensor(0.0930, device='cuda:0', grad_fn=) cls_loss: tensor(0.0604, device='cuda:0', grad_fn=) cls_loss: tensor(0.0886, device='cuda:0', grad_fn=) cls_loss: tensor(0.0346, device='cuda:0', grad_fn=) cls_loss: tensor(0.0190, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0.0069, device='cuda:0', grad_fn=) cls_loss: tensor(0.1006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.1293, device='cuda:0', grad_fn=) cls_loss: tensor(0.0221, device='cuda:0', grad_fn=) cls_loss: tensor(0.0155, device='cuda:0', grad_fn=) cls_loss: tensor(0.0426, device='cuda:0', grad_fn=) cls_loss: tensor(0.0484, device='cuda:0', grad_fn=) cls_loss: tensor(0.0237, device='cuda:0', grad_fn=) cls_loss: tensor(0.1016, device='cuda:0', grad_fn=) cls_loss: tensor(0.1096, device='cuda:0', grad_fn=) cls_loss: tensor(0.0361, device='cuda:0', grad_fn=) cls_loss: tensor(0.0409, device='cuda:0', grad_fn=) cls_loss: tensor(0.4445, device='cuda:0', grad_fn=) cls_loss: tensor(0.0165, device='cuda:0', grad_fn=) cls_loss: tensor(0.0728, device='cuda:0', grad_fn=) cls_loss: tensor(0.0664, device='cuda:0', grad_fn=) cls_loss: tensor(0.0229, device='cuda:0', grad_fn=) cls_loss: tensor(0.0208, device='cuda:0', grad_fn=) cls_loss: tensor(0.0065, device='cuda:0', grad_fn=) cls_loss: tensor(0.0352, device='cuda:0', grad_fn=) cls_loss: tensor(0.0337, device='cuda:0', grad_fn=) cls_loss: tensor(0.0300, device='cuda:0', grad_fn=) cls_loss: tensor(0.0076, device='cuda:0', grad_fn=) cls_loss: tensor(0.0107, device='cuda:0', grad_fn=) cls_loss: tensor(0.0987, device='cuda:0', grad_fn=) cls_loss: tensor(0.2269, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 11, time 178.47, cls_loss 0.0731 100 cls_loss: tensor(0.0491, device='cuda:0', grad_fn=) cls_loss: tensor(0.0192, device='cuda:0', grad_fn=) cls_loss: tensor(0.1590, device='cuda:0', grad_fn=) cls_loss: tensor(0.0670, device='cuda:0', grad_fn=) cls_loss: tensor(0.0150, device='cuda:0', grad_fn=) cls_loss: tensor(0.2276, device='cuda:0', grad_fn=) cls_loss: tensor(0.0204, device='cuda:0', grad_fn=) cls_loss: tensor(0.1331, device='cuda:0', grad_fn=) cls_loss: tensor(0.0167, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.1663, device='cuda:0', grad_fn=) cls_loss: tensor(0.0402, device='cuda:0', grad_fn=) cls_loss: tensor(0.0165, device='cuda:0', grad_fn=) cls_loss: tensor(0.0307, device='cuda:0', grad_fn=) cls_loss: tensor(0.0117, device='cuda:0', grad_fn=) cls_loss: tensor(0.0378, device='cuda:0', grad_fn=) cls_loss: tensor(0.0077, device='cuda:0', grad_fn=) cls_loss: tensor(0.0129, device='cuda:0', grad_fn=) cls_loss: tensor(0.1353, device='cuda:0', grad_fn=) cls_loss: tensor(0.0823, device='cuda:0', grad_fn=) cls_loss: tensor(0.1035, device='cuda:0', grad_fn=) cls_loss: tensor(0.0781, device='cuda:0', grad_fn=) cls_loss: tensor(0.2191, device='cuda:0', grad_fn=) cls_loss: tensor(0.0370, device='cuda:0', grad_fn=) cls_loss: tensor(0.0145, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(0.0691, device='cuda:0', grad_fn=) cls_loss: tensor(0.0087, device='cuda:0', grad_fn=) cls_loss: tensor(0.0196, device='cuda:0', grad_fn=) cls_loss: tensor(0.0954, device='cuda:0', grad_fn=) cls_loss: tensor(0.0124, device='cuda:0', grad_fn=) cls_loss: tensor(0.0604, device='cuda:0', grad_fn=) cls_loss: tensor(0.2066, device='cuda:0', grad_fn=) cls_loss: tensor(0.0497, device='cuda:0', grad_fn=) cls_loss: tensor(0.1246, device='cuda:0', grad_fn=) cls_loss: tensor(0.0583, device='cuda:0', grad_fn=) cls_loss: tensor(0.0353, device='cuda:0', grad_fn=) cls_loss: tensor(0.1685, device='cuda:0', grad_fn=) cls_loss: tensor(0.0108, device='cuda:0', grad_fn=) cls_loss: tensor(0.0200, device='cuda:0', grad_fn=) cls_loss: tensor(0.0350, device='cuda:0', grad_fn=) cls_loss: tensor(0.1684, device='cuda:0', grad_fn=) cls_loss: tensor(0.0552, device='cuda:0', grad_fn=) cls_loss: tensor(0.0319, device='cuda:0', grad_fn=) cls_loss: tensor(0.0751, device='cuda:0', grad_fn=) cls_loss: tensor(0.0142, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0137, device='cuda:0', grad_fn=) cls_loss: tensor(0.0824, device='cuda:0', grad_fn=) cls_loss: tensor(0.0805, device='cuda:0', grad_fn=) cls_loss: tensor(0.0644, device='cuda:0', grad_fn=) cls_loss: tensor(0.0369, device='cuda:0', grad_fn=) cls_loss: tensor(0.0456, device='cuda:0', grad_fn=) cls_loss: tensor(0.2754, device='cuda:0', grad_fn=) cls_loss: tensor(0.0629, device='cuda:0', grad_fn=) cls_loss: tensor(0.1936, device='cuda:0', grad_fn=) cls_loss: tensor(0.0268, device='cuda:0', grad_fn=) cls_loss: tensor(0.0073, device='cuda:0', grad_fn=) cls_loss: tensor(0.0406, device='cuda:0', grad_fn=) cls_loss: tensor(0.0101, device='cuda:0', grad_fn=) cls_loss: tensor(0.0224, device='cuda:0', grad_fn=) cls_loss: tensor(0.0289, device='cuda:0', grad_fn=) cls_loss: tensor(0.0751, device='cuda:0', grad_fn=) cls_loss: tensor(0.2606, device='cuda:0', grad_fn=) cls_loss: tensor(0.1249, device='cuda:0', grad_fn=) cls_loss: tensor(0.0082, device='cuda:0', grad_fn=) cls_loss: tensor(0.0573, device='cuda:0', grad_fn=) cls_loss: tensor(0.0079, device='cuda:0', grad_fn=) cls_loss: tensor(0.0423, device='cuda:0', grad_fn=) cls_loss: tensor(0.0857, device='cuda:0', grad_fn=) cls_loss: tensor(0.0204, device='cuda:0', grad_fn=) cls_loss: tensor(0.0077, device='cuda:0', grad_fn=) cls_loss: tensor(0.0201, device='cuda:0', grad_fn=) cls_loss: tensor(0.0576, device='cuda:0', grad_fn=) cls_loss: tensor(0.0408, device='cuda:0', grad_fn=) cls_loss: tensor(0.1176, device='cuda:0', grad_fn=) cls_loss: tensor(0.0275, device='cuda:0', grad_fn=) cls_loss: tensor(0.0153, device='cuda:0', grad_fn=) cls_loss: tensor(0.0678, device='cuda:0', grad_fn=) cls_loss: tensor(0.0598, device='cuda:0', grad_fn=) cls_loss: tensor(0.0268, device='cuda:0', grad_fn=) cls_loss: tensor(0.0693, device='cuda:0', grad_fn=) cls_loss: tensor(0.0168, device='cuda:0', grad_fn=) cls_loss: tensor(0.0260, device='cuda:0', grad_fn=) cls_loss: tensor(0.0103, device='cuda:0', grad_fn=) cls_loss: tensor(0.0117, device='cuda:0', grad_fn=) cls_loss: tensor(0.0322, device='cuda:0', grad_fn=) cls_loss: tensor(0.0083, device='cuda:0', grad_fn=) cls_loss: tensor(0.0092, device='cuda:0', grad_fn=) cls_loss: tensor(0.0275, device='cuda:0', grad_fn=) cls_loss: tensor(0.0621, device='cuda:0', grad_fn=) cls_loss: tensor(0.0209, device='cuda:0', grad_fn=) cls_loss: tensor(0.0531, device='cuda:0', grad_fn=) cls_loss: tensor(0.0461, device='cuda:0', grad_fn=) cls_loss: tensor(0.0141, device='cuda:0', grad_fn=) cls_loss: tensor(0.0303, device='cuda:0', grad_fn=) cls_loss: tensor(0.0210, device='cuda:0', grad_fn=) cls_loss: tensor(0.0105, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0.3584, device='cuda:0', grad_fn=) 0.0001 changing lr ---------------------saving model at epoch 12---------------------------------------------------- epoch 12, time 178.58, cls_loss 0.0598 100 cls_loss: tensor(0.0068, device='cuda:0', grad_fn=) cls_loss: tensor(0.0140, device='cuda:0', grad_fn=) cls_loss: tensor(0.0090, device='cuda:0', grad_fn=) cls_loss: tensor(0.0771, device='cuda:0', grad_fn=) cls_loss: tensor(0.0132, device='cuda:0', grad_fn=) cls_loss: tensor(0.0186, device='cuda:0', grad_fn=) cls_loss: tensor(0.0145, device='cuda:0', grad_fn=) cls_loss: tensor(0.0461, device='cuda:0', grad_fn=) cls_loss: tensor(0.0483, device='cuda:0', grad_fn=) cls_loss: tensor(0.0407, device='cuda:0', grad_fn=) cls_loss: tensor(0.0319, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0081, device='cuda:0', grad_fn=) cls_loss: tensor(0.0267, device='cuda:0', grad_fn=) cls_loss: tensor(0.1260, device='cuda:0', grad_fn=) cls_loss: tensor(0.0159, device='cuda:0', grad_fn=) cls_loss: tensor(0.0245, device='cuda:0', grad_fn=) cls_loss: tensor(0.0137, device='cuda:0', grad_fn=) cls_loss: tensor(0.0820, device='cuda:0', grad_fn=) cls_loss: tensor(0.0133, device='cuda:0', grad_fn=) cls_loss: tensor(0.0111, device='cuda:0', grad_fn=) cls_loss: tensor(0.0215, device='cuda:0', grad_fn=) cls_loss: tensor(0.0157, device='cuda:0', grad_fn=) cls_loss: tensor(0.0368, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(0.0190, device='cuda:0', grad_fn=) cls_loss: tensor(0.0289, device='cuda:0', grad_fn=) cls_loss: tensor(0.0203, device='cuda:0', grad_fn=) cls_loss: tensor(0.0055, device='cuda:0', grad_fn=) cls_loss: tensor(0.0095, device='cuda:0', grad_fn=) cls_loss: tensor(0.0098, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(0.1173, device='cuda:0', grad_fn=) cls_loss: tensor(0.0280, device='cuda:0', grad_fn=) cls_loss: tensor(0.0073, device='cuda:0', grad_fn=) cls_loss: tensor(0.0122, device='cuda:0', grad_fn=) cls_loss: tensor(0.0484, device='cuda:0', grad_fn=) cls_loss: tensor(0.0088, device='cuda:0', grad_fn=) cls_loss: tensor(0.2532, device='cuda:0', grad_fn=) cls_loss: tensor(0.0132, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0053, device='cuda:0', grad_fn=) cls_loss: tensor(0.0295, device='cuda:0', grad_fn=) cls_loss: tensor(0.1514, device='cuda:0', grad_fn=) cls_loss: tensor(0.0653, device='cuda:0', grad_fn=) cls_loss: tensor(0.2167, device='cuda:0', grad_fn=) cls_loss: tensor(0.0105, device='cuda:0', grad_fn=) cls_loss: tensor(0.0499, device='cuda:0', grad_fn=) cls_loss: tensor(0.0599, device='cuda:0', grad_fn=) cls_loss: tensor(0.0174, device='cuda:0', grad_fn=) cls_loss: tensor(0.0194, device='cuda:0', grad_fn=) cls_loss: tensor(0.0252, device='cuda:0', grad_fn=) cls_loss: tensor(0.0183, device='cuda:0', grad_fn=) cls_loss: tensor(0.0487, device='cuda:0', grad_fn=) cls_loss: tensor(0.0450, device='cuda:0', grad_fn=) cls_loss: tensor(0.0738, device='cuda:0', grad_fn=) cls_loss: tensor(0.0128, device='cuda:0', grad_fn=) cls_loss: tensor(0.0198, device='cuda:0', grad_fn=) cls_loss: tensor(0.0231, device='cuda:0', grad_fn=) cls_loss: tensor(0.1258, device='cuda:0', grad_fn=) cls_loss: tensor(0.0429, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.2483, device='cuda:0', grad_fn=) cls_loss: tensor(0.1792, device='cuda:0', grad_fn=) cls_loss: tensor(0.0203, device='cuda:0', grad_fn=) cls_loss: tensor(0.0058, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0112, device='cuda:0', grad_fn=) cls_loss: tensor(0.0279, device='cuda:0', grad_fn=) cls_loss: tensor(0.0132, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0.0253, device='cuda:0', grad_fn=) cls_loss: tensor(0.0158, device='cuda:0', grad_fn=) cls_loss: tensor(0.0696, device='cuda:0', grad_fn=) cls_loss: tensor(0.1529, device='cuda:0', grad_fn=) cls_loss: tensor(0.1268, device='cuda:0', grad_fn=) cls_loss: tensor(0.0175, device='cuda:0', grad_fn=) cls_loss: tensor(0.0118, device='cuda:0', grad_fn=) cls_loss: tensor(0.0167, device='cuda:0', grad_fn=) cls_loss: tensor(0.0500, device='cuda:0', grad_fn=) cls_loss: tensor(0.0215, device='cuda:0', grad_fn=) cls_loss: tensor(0.0100, device='cuda:0', grad_fn=) cls_loss: tensor(0.0609, device='cuda:0', grad_fn=) cls_loss: tensor(0.0378, device='cuda:0', grad_fn=) cls_loss: tensor(0.0185, device='cuda:0', grad_fn=) cls_loss: tensor(0.0192, device='cuda:0', grad_fn=) cls_loss: tensor(0.0208, device='cuda:0', grad_fn=) cls_loss: tensor(0.1907, device='cuda:0', grad_fn=) cls_loss: tensor(0.0153, device='cuda:0', grad_fn=) cls_loss: tensor(0.0750, device='cuda:0', grad_fn=) cls_loss: tensor(0.0260, device='cuda:0', grad_fn=) cls_loss: tensor(0.0044, device='cuda:0', grad_fn=) cls_loss: tensor(0.2326, device='cuda:0', grad_fn=) cls_loss: tensor(0.1205, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0.0630, device='cuda:0', grad_fn=) cls_loss: tensor(0.0440, device='cuda:0', grad_fn=) cls_loss: tensor(0.0115, device='cuda:0', grad_fn=) cls_loss: tensor(0.2303, device='cuda:0', grad_fn=) cls_loss: tensor(0.0493, device='cuda:0', grad_fn=) 0.0001 changing lr ---------------------saving model at epoch 13---------------------------------------------------- epoch 13, time 178.67, cls_loss 0.0462 100 cls_loss: tensor(0.0509, device='cuda:0', grad_fn=) cls_loss: tensor(0.0279, device='cuda:0', grad_fn=) cls_loss: tensor(0.0175, device='cuda:0', grad_fn=) cls_loss: tensor(0.0353, device='cuda:0', grad_fn=) cls_loss: tensor(0.0468, device='cuda:0', grad_fn=) cls_loss: tensor(0.0059, device='cuda:0', grad_fn=) cls_loss: tensor(0.0477, device='cuda:0', grad_fn=) cls_loss: tensor(0.0947, device='cuda:0', grad_fn=) cls_loss: tensor(0.0127, device='cuda:0', grad_fn=) cls_loss: tensor(0.0168, device='cuda:0', grad_fn=) cls_loss: tensor(0.0239, device='cuda:0', grad_fn=) cls_loss: tensor(0.1806, device='cuda:0', grad_fn=) cls_loss: tensor(0.0535, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0172, device='cuda:0', grad_fn=) cls_loss: tensor(0.0707, device='cuda:0', grad_fn=) cls_loss: tensor(0.0443, device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(0.0109, device='cuda:0', grad_fn=) cls_loss: tensor(0.1148, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(0.0633, device='cuda:0', grad_fn=) cls_loss: tensor(0.0081, device='cuda:0', grad_fn=) cls_loss: tensor(0.4964, device='cuda:0', grad_fn=) cls_loss: tensor(0.0509, device='cuda:0', grad_fn=) cls_loss: tensor(0.0836, device='cuda:0', grad_fn=) cls_loss: tensor(0.0338, device='cuda:0', grad_fn=) cls_loss: tensor(0.0150, device='cuda:0', grad_fn=) cls_loss: tensor(0.0947, device='cuda:0', grad_fn=) cls_loss: tensor(0.2029, device='cuda:0', grad_fn=) cls_loss: tensor(0.1070, device='cuda:0', grad_fn=) cls_loss: tensor(0.0101, device='cuda:0', grad_fn=) cls_loss: tensor(0.0562, device='cuda:0', grad_fn=) cls_loss: tensor(0.0105, device='cuda:0', grad_fn=) cls_loss: tensor(0.1187, device='cuda:0', grad_fn=) cls_loss: tensor(0.1607, device='cuda:0', grad_fn=) cls_loss: tensor(0.0900, device='cuda:0', grad_fn=) cls_loss: tensor(0.0389, device='cuda:0', grad_fn=) cls_loss: tensor(0.0062, device='cuda:0', grad_fn=) cls_loss: tensor(0.0264, device='cuda:0', grad_fn=) cls_loss: tensor(0.0382, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.1485, device='cuda:0', grad_fn=) cls_loss: tensor(0.0143, device='cuda:0', grad_fn=) cls_loss: tensor(0.0139, device='cuda:0', grad_fn=) cls_loss: tensor(0.0327, device='cuda:0', grad_fn=) cls_loss: tensor(0.0473, device='cuda:0', grad_fn=) cls_loss: tensor(0.0585, device='cuda:0', grad_fn=) cls_loss: tensor(0.0093, device='cuda:0', grad_fn=) cls_loss: tensor(0.0146, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(0.0621, device='cuda:0', grad_fn=) cls_loss: tensor(0.0078, device='cuda:0', grad_fn=) cls_loss: tensor(0.0161, device='cuda:0', grad_fn=) cls_loss: tensor(0.0165, device='cuda:0', grad_fn=) cls_loss: tensor(0.0566, device='cuda:0', grad_fn=) cls_loss: tensor(0.1576, device='cuda:0', grad_fn=) cls_loss: tensor(0.0677, device='cuda:0', grad_fn=) cls_loss: tensor(0.0452, device='cuda:0', grad_fn=) cls_loss: tensor(0.0451, device='cuda:0', grad_fn=) cls_loss: tensor(0.0100, device='cuda:0', grad_fn=) cls_loss: tensor(0.0180, device='cuda:0', grad_fn=) cls_loss: tensor(0.0424, device='cuda:0', grad_fn=) cls_loss: tensor(0.0099, device='cuda:0', grad_fn=) cls_loss: tensor(0.0083, device='cuda:0', grad_fn=) cls_loss: tensor(0.0232, device='cuda:0', grad_fn=) cls_loss: tensor(0.0763, device='cuda:0', grad_fn=) cls_loss: tensor(0.1343, device='cuda:0', grad_fn=) cls_loss: tensor(0.0671, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0.0727, device='cuda:0', grad_fn=) cls_loss: tensor(0.0535, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(0.0164, device='cuda:0', grad_fn=) cls_loss: tensor(0.0499, device='cuda:0', grad_fn=) cls_loss: tensor(0.0318, device='cuda:0', grad_fn=) cls_loss: tensor(0.0069, device='cuda:0', grad_fn=) cls_loss: tensor(0.0125, device='cuda:0', grad_fn=) cls_loss: tensor(0.0203, device='cuda:0', grad_fn=) cls_loss: tensor(0.0063, device='cuda:0', grad_fn=) cls_loss: tensor(0.0113, device='cuda:0', grad_fn=) cls_loss: tensor(0.2392, device='cuda:0', grad_fn=) cls_loss: tensor(0.1182, device='cuda:0', grad_fn=) cls_loss: tensor(0.1140, device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(0.0070, device='cuda:0', grad_fn=) cls_loss: tensor(0.0116, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0254, device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(0.0705, device='cuda:0', grad_fn=) cls_loss: tensor(0.0240, device='cuda:0', grad_fn=) cls_loss: tensor(0.0238, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.1812, device='cuda:0', grad_fn=) cls_loss: tensor(0.0608, device='cuda:0', grad_fn=) cls_loss: tensor(0.0404, device='cuda:0', grad_fn=) cls_loss: tensor(0.0107, device='cuda:0', grad_fn=) cls_loss: tensor(0.0143, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 14, time 179.05, cls_loss 0.0505 100 cls_loss: tensor(0.0564, device='cuda:0', grad_fn=) cls_loss: tensor(0.0105, device='cuda:0', grad_fn=) cls_loss: tensor(0.1742, device='cuda:0', grad_fn=) cls_loss: tensor(0.0147, device='cuda:0', grad_fn=) cls_loss: tensor(0.0135, device='cuda:0', grad_fn=) cls_loss: tensor(0.0223, device='cuda:0', grad_fn=) cls_loss: tensor(0.0300, device='cuda:0', grad_fn=) cls_loss: tensor(0.0354, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(0.0166, device='cuda:0', grad_fn=) cls_loss: tensor(0.0071, device='cuda:0', grad_fn=) cls_loss: tensor(0.0064, device='cuda:0', grad_fn=) cls_loss: tensor(0.0118, device='cuda:0', grad_fn=) cls_loss: tensor(0.0506, device='cuda:0', grad_fn=) cls_loss: tensor(0.0267, device='cuda:0', grad_fn=) cls_loss: tensor(0.0319, device='cuda:0', grad_fn=) cls_loss: tensor(0.0250, device='cuda:0', grad_fn=) cls_loss: tensor(0.2549, device='cuda:0', grad_fn=) cls_loss: tensor(0.0917, device='cuda:0', grad_fn=) cls_loss: tensor(0.0285, device='cuda:0', grad_fn=) cls_loss: tensor(0.0294, device='cuda:0', grad_fn=) cls_loss: tensor(0.0125, device='cuda:0', grad_fn=) cls_loss: tensor(0.0260, device='cuda:0', grad_fn=) cls_loss: tensor(0.0166, device='cuda:0', grad_fn=) cls_loss: tensor(0.0077, device='cuda:0', grad_fn=) cls_loss: tensor(0.0124, device='cuda:0', grad_fn=) cls_loss: tensor(0.0103, device='cuda:0', grad_fn=) cls_loss: tensor(0.0075, device='cuda:0', grad_fn=) cls_loss: tensor(0.0165, device='cuda:0', grad_fn=) cls_loss: tensor(0.0365, device='cuda:0', grad_fn=) cls_loss: tensor(0.0343, device='cuda:0', grad_fn=) cls_loss: tensor(0.0254, device='cuda:0', grad_fn=) cls_loss: tensor(0.0176, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0478, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0695, device='cuda:0', grad_fn=) cls_loss: tensor(0.0726, device='cuda:0', grad_fn=) cls_loss: tensor(0.0549, device='cuda:0', grad_fn=) cls_loss: tensor(0.1766, device='cuda:0', grad_fn=) cls_loss: tensor(0.1515, device='cuda:0', grad_fn=) cls_loss: tensor(0.0232, device='cuda:0', grad_fn=) cls_loss: tensor(0.0076, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0.0375, device='cuda:0', grad_fn=) cls_loss: tensor(0.0644, device='cuda:0', grad_fn=) cls_loss: tensor(0.0129, device='cuda:0', grad_fn=) cls_loss: tensor(0.0096, device='cuda:0', grad_fn=) cls_loss: tensor(0.0762, device='cuda:0', grad_fn=) cls_loss: tensor(0.0929, device='cuda:0', grad_fn=) cls_loss: tensor(0.0079, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(0.0248, device='cuda:0', grad_fn=) cls_loss: tensor(0.0095, device='cuda:0', grad_fn=) cls_loss: tensor(0.0110, device='cuda:0', grad_fn=) cls_loss: tensor(0.0285, device='cuda:0', grad_fn=) cls_loss: tensor(0.0525, device='cuda:0', grad_fn=) cls_loss: tensor(0.0634, device='cuda:0', grad_fn=) cls_loss: tensor(0.1577, device='cuda:0', grad_fn=) cls_loss: tensor(0.0332, device='cuda:0', grad_fn=) cls_loss: tensor(0.1593, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(0.0817, device='cuda:0', grad_fn=) cls_loss: tensor(0.0329, device='cuda:0', grad_fn=) cls_loss: tensor(0.0564, device='cuda:0', grad_fn=) cls_loss: tensor(0.0365, device='cuda:0', grad_fn=) cls_loss: tensor(0.0414, device='cuda:0', grad_fn=) cls_loss: tensor(0.0445, device='cuda:0', grad_fn=) cls_loss: tensor(0.0667, device='cuda:0', grad_fn=) cls_loss: tensor(0.0102, device='cuda:0', grad_fn=) cls_loss: tensor(0.0118, device='cuda:0', grad_fn=) cls_loss: tensor(0.0254, device='cuda:0', grad_fn=) cls_loss: tensor(0.1123, device='cuda:0', grad_fn=) cls_loss: tensor(0.0635, device='cuda:0', grad_fn=) cls_loss: tensor(0.0084, device='cuda:0', grad_fn=) cls_loss: tensor(0.0161, device='cuda:0', grad_fn=) cls_loss: tensor(0.0297, device='cuda:0', grad_fn=) cls_loss: tensor(0.0644, device='cuda:0', grad_fn=) cls_loss: tensor(0.0104, device='cuda:0', grad_fn=) cls_loss: tensor(0.0308, device='cuda:0', grad_fn=) cls_loss: tensor(0.0731, device='cuda:0', grad_fn=) cls_loss: tensor(0.0277, device='cuda:0', grad_fn=) cls_loss: tensor(0.0199, device='cuda:0', grad_fn=) cls_loss: tensor(0.1604, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0193, device='cuda:0', grad_fn=) cls_loss: tensor(0.0057, device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(0.0211, device='cuda:0', grad_fn=) cls_loss: tensor(0.0066, device='cuda:0', grad_fn=) cls_loss: tensor(0.0244, device='cuda:0', grad_fn=) cls_loss: tensor(0.0209, device='cuda:0', grad_fn=) cls_loss: tensor(0.0086, device='cuda:0', grad_fn=) cls_loss: tensor(0.1255, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.1676, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(0.0102, device='cuda:0', grad_fn=) cls_loss: tensor(0.0171, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 15, time 179.05, cls_loss 0.0410 100 cls_loss: tensor(0.0254, device='cuda:0', grad_fn=) cls_loss: tensor(0.0065, device='cuda:0', grad_fn=) cls_loss: tensor(0.0354, device='cuda:0', grad_fn=) cls_loss: tensor(0.0499, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(0.0462, device='cuda:0', grad_fn=) cls_loss: tensor(0.0245, device='cuda:0', grad_fn=) cls_loss: tensor(0.0065, device='cuda:0', grad_fn=) cls_loss: tensor(0.0663, device='cuda:0', grad_fn=) cls_loss: tensor(0.0055, device='cuda:0', grad_fn=) cls_loss: tensor(0.0891, device='cuda:0', grad_fn=) cls_loss: tensor(0.1743, device='cuda:0', grad_fn=) cls_loss: tensor(0.0699, device='cuda:0', grad_fn=) cls_loss: tensor(0.0535, device='cuda:0', grad_fn=) cls_loss: tensor(0.0933, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0055, device='cuda:0', grad_fn=) cls_loss: tensor(0.0658, device='cuda:0', grad_fn=) cls_loss: tensor(0.0176, device='cuda:0', grad_fn=) cls_loss: tensor(0.0243, device='cuda:0', grad_fn=) cls_loss: tensor(0.0642, device='cuda:0', grad_fn=) cls_loss: tensor(0.1933, device='cuda:0', grad_fn=) cls_loss: tensor(0.0215, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.1086, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0.0324, device='cuda:0', grad_fn=) cls_loss: tensor(0.0196, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(0.0548, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0415, device='cuda:0', grad_fn=) cls_loss: tensor(0.0855, device='cuda:0', grad_fn=) cls_loss: tensor(0.0521, device='cuda:0', grad_fn=) cls_loss: tensor(0.0072, device='cuda:0', grad_fn=) cls_loss: tensor(0.0358, device='cuda:0', grad_fn=) cls_loss: tensor(0.0088, device='cuda:0', grad_fn=) cls_loss: tensor(0.0172, device='cuda:0', grad_fn=) cls_loss: tensor(0.0156, device='cuda:0', grad_fn=) cls_loss: tensor(0.0536, device='cuda:0', grad_fn=) cls_loss: tensor(0.0407, device='cuda:0', grad_fn=) cls_loss: tensor(0.0358, device='cuda:0', grad_fn=) cls_loss: tensor(0.3182, device='cuda:0', grad_fn=) cls_loss: tensor(0.1338, device='cuda:0', grad_fn=) cls_loss: tensor(0.0289, device='cuda:0', grad_fn=) cls_loss: tensor(0.0478, device='cuda:0', grad_fn=) cls_loss: tensor(0.0081, device='cuda:0', grad_fn=) cls_loss: tensor(0.0241, device='cuda:0', grad_fn=) cls_loss: tensor(0.0312, device='cuda:0', grad_fn=) cls_loss: tensor(0.0070, device='cuda:0', grad_fn=) cls_loss: tensor(0.1240, device='cuda:0', grad_fn=) cls_loss: tensor(0.0459, device='cuda:0', grad_fn=) cls_loss: tensor(0.0173, device='cuda:0', grad_fn=) cls_loss: tensor(0.0171, device='cuda:0', grad_fn=) cls_loss: tensor(0.0292, device='cuda:0', grad_fn=) cls_loss: tensor(0.0088, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0079, device='cuda:0', grad_fn=) cls_loss: tensor(0.0391, device='cuda:0', grad_fn=) cls_loss: tensor(0.0275, device='cuda:0', grad_fn=) cls_loss: tensor(0.0093, device='cuda:0', grad_fn=) cls_loss: tensor(0.0124, device='cuda:0', grad_fn=) cls_loss: tensor(0.2029, device='cuda:0', grad_fn=) cls_loss: tensor(0.0672, device='cuda:0', grad_fn=) cls_loss: tensor(0.0125, device='cuda:0', grad_fn=) cls_loss: tensor(0.0387, device='cuda:0', grad_fn=) cls_loss: tensor(0.0150, device='cuda:0', grad_fn=) cls_loss: tensor(0.0530, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(0.0101, device='cuda:0', grad_fn=) cls_loss: tensor(0.0483, device='cuda:0', grad_fn=) cls_loss: tensor(0.0111, device='cuda:0', grad_fn=) cls_loss: tensor(0.0270, device='cuda:0', grad_fn=) cls_loss: tensor(0.0081, device='cuda:0', grad_fn=) cls_loss: tensor(0.0203, device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(0.0584, device='cuda:0', grad_fn=) cls_loss: tensor(0.0081, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0189, device='cuda:0', grad_fn=) cls_loss: tensor(0.0142, device='cuda:0', grad_fn=) cls_loss: tensor(0.0545, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0.0139, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(0.0448, device='cuda:0', grad_fn=) cls_loss: tensor(0.0507, device='cuda:0', grad_fn=) cls_loss: tensor(0.0262, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(0.0084, device='cuda:0', grad_fn=) cls_loss: tensor(0.0418, device='cuda:0', grad_fn=) cls_loss: tensor(0.0478, device='cuda:0', grad_fn=) cls_loss: tensor(0.0174, device='cuda:0', grad_fn=) cls_loss: tensor(0.0326, device='cuda:0', grad_fn=) cls_loss: tensor(0.0078, device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(0.0239, device='cuda:0', grad_fn=) 0.0001 changing lr ---------------------saving model at epoch 16---------------------------------------------------- epoch 16, time 179.14, cls_loss 0.0373 100 cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(0.0069, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0290, device='cuda:0', grad_fn=) cls_loss: tensor(0.0471, device='cuda:0', grad_fn=) cls_loss: tensor(0.0337, device='cuda:0', grad_fn=) cls_loss: tensor(0.0445, device='cuda:0', grad_fn=) cls_loss: tensor(0.0199, device='cuda:0', grad_fn=) cls_loss: tensor(0.0096, device='cuda:0', grad_fn=) cls_loss: tensor(0.1511, device='cuda:0', grad_fn=) cls_loss: tensor(0.0150, device='cuda:0', grad_fn=) cls_loss: tensor(0.0387, device='cuda:0', grad_fn=) cls_loss: tensor(0.0358, device='cuda:0', grad_fn=) cls_loss: tensor(0.1054, device='cuda:0', grad_fn=) cls_loss: tensor(0.0188, device='cuda:0', grad_fn=) cls_loss: tensor(0.0548, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0828, device='cuda:0', grad_fn=) cls_loss: tensor(0.0195, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0124, device='cuda:0', grad_fn=) cls_loss: tensor(0.0212, device='cuda:0', grad_fn=) cls_loss: tensor(0.0117, device='cuda:0', grad_fn=) cls_loss: tensor(0.1486, device='cuda:0', grad_fn=) cls_loss: tensor(0.0209, device='cuda:0', grad_fn=) cls_loss: tensor(0.2272, device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(0.0157, device='cuda:0', grad_fn=) cls_loss: tensor(0.0060, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(0.0077, device='cuda:0', grad_fn=) cls_loss: tensor(0.0074, device='cuda:0', grad_fn=) cls_loss: tensor(0.0212, device='cuda:0', grad_fn=) cls_loss: tensor(0.0092, device='cuda:0', grad_fn=) cls_loss: tensor(0.0482, device='cuda:0', grad_fn=) cls_loss: tensor(0.0941, device='cuda:0', grad_fn=) cls_loss: tensor(0.1565, device='cuda:0', grad_fn=) cls_loss: tensor(0.0255, device='cuda:0', grad_fn=) cls_loss: tensor(0.0389, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0058, device='cuda:0', grad_fn=) cls_loss: tensor(0.0549, device='cuda:0', grad_fn=) cls_loss: tensor(0.0066, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0114, device='cuda:0', grad_fn=) cls_loss: tensor(0.1331, device='cuda:0', grad_fn=) cls_loss: tensor(0.0156, device='cuda:0', grad_fn=) cls_loss: tensor(0.0226, device='cuda:0', grad_fn=) cls_loss: tensor(0.0171, device='cuda:0', grad_fn=) cls_loss: tensor(0.0642, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0544, device='cuda:0', grad_fn=) cls_loss: tensor(0.0059, device='cuda:0', grad_fn=) cls_loss: tensor(0.0081, device='cuda:0', grad_fn=) cls_loss: tensor(0.0190, device='cuda:0', grad_fn=) cls_loss: tensor(0.0249, device='cuda:0', grad_fn=) cls_loss: tensor(0.0303, device='cuda:0', grad_fn=) cls_loss: tensor(0.0060, device='cuda:0', grad_fn=) cls_loss: tensor(0.1239, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0432, device='cuda:0', grad_fn=) cls_loss: tensor(0.0427, device='cuda:0', grad_fn=) cls_loss: tensor(0.0059, device='cuda:0', grad_fn=) cls_loss: tensor(0.0258, device='cuda:0', grad_fn=) cls_loss: tensor(0.0720, device='cuda:0', grad_fn=) cls_loss: tensor(0.0401, device='cuda:0', grad_fn=) cls_loss: tensor(0.0844, device='cuda:0', grad_fn=) cls_loss: tensor(0.0206, device='cuda:0', grad_fn=) cls_loss: tensor(0.0703, device='cuda:0', grad_fn=) cls_loss: tensor(0.0061, device='cuda:0', grad_fn=) cls_loss: tensor(0.0147, device='cuda:0', grad_fn=) cls_loss: tensor(0.1852, device='cuda:0', grad_fn=) cls_loss: tensor(0.0071, device='cuda:0', grad_fn=) cls_loss: tensor(0.0463, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(0.0123, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0192, device='cuda:0', grad_fn=) cls_loss: tensor(0.0078, device='cuda:0', grad_fn=) cls_loss: tensor(0.2530, device='cuda:0', grad_fn=) cls_loss: tensor(0.0131, device='cuda:0', grad_fn=) cls_loss: tensor(0.0192, device='cuda:0', grad_fn=) cls_loss: tensor(0.0244, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0591, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0262, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(0.0538, device='cuda:0', grad_fn=) cls_loss: tensor(0.1123, device='cuda:0', grad_fn=) cls_loss: tensor(0.0587, device='cuda:0', grad_fn=) cls_loss: tensor(0.0364, device='cuda:0', grad_fn=) cls_loss: tensor(0.0289, device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(0.0298, device='cuda:0', grad_fn=) cls_loss: tensor(0.0570, device='cuda:0', grad_fn=) cls_loss: tensor(0.0242, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 17, time 179.44, cls_loss 0.0374 100 cls_loss: tensor(0.0167, device='cuda:0', grad_fn=) cls_loss: tensor(0.0762, device='cuda:0', grad_fn=) cls_loss: tensor(0.0105, device='cuda:0', grad_fn=) cls_loss: tensor(0.0074, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0123, device='cuda:0', grad_fn=) cls_loss: tensor(0.0128, device='cuda:0', grad_fn=) cls_loss: tensor(0.0180, device='cuda:0', grad_fn=) cls_loss: tensor(0.0187, device='cuda:0', grad_fn=) cls_loss: tensor(0.0165, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(0.0117, device='cuda:0', grad_fn=) cls_loss: tensor(0.0100, device='cuda:0', grad_fn=) cls_loss: tensor(0.0308, device='cuda:0', grad_fn=) cls_loss: tensor(0.0076, device='cuda:0', grad_fn=) cls_loss: tensor(0.0065, device='cuda:0', grad_fn=) cls_loss: tensor(0.0061, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.1265, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0207, device='cuda:0', grad_fn=) cls_loss: tensor(0.0087, device='cuda:0', grad_fn=) cls_loss: tensor(0.0161, device='cuda:0', grad_fn=) cls_loss: tensor(0.0459, device='cuda:0', grad_fn=) cls_loss: tensor(0.0051, device='cuda:0', grad_fn=) cls_loss: tensor(0.0075, device='cuda:0', grad_fn=) cls_loss: tensor(0.0848, device='cuda:0', grad_fn=) cls_loss: tensor(0.0059, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0079, device='cuda:0', grad_fn=) cls_loss: tensor(0.0429, device='cuda:0', grad_fn=) cls_loss: tensor(0.0051, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(0.0430, device='cuda:0', grad_fn=) cls_loss: tensor(0.0373, device='cuda:0', grad_fn=) cls_loss: tensor(0.0089, device='cuda:0', grad_fn=) cls_loss: tensor(0.0256, device='cuda:0', grad_fn=) cls_loss: tensor(0.1256, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0312, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(0.0251, device='cuda:0', grad_fn=) cls_loss: tensor(0.0108, device='cuda:0', grad_fn=) cls_loss: tensor(0.0188, device='cuda:0', grad_fn=) cls_loss: tensor(0.0106, device='cuda:0', grad_fn=) cls_loss: tensor(0.0070, device='cuda:0', grad_fn=) cls_loss: tensor(0.0361, device='cuda:0', grad_fn=) cls_loss: tensor(0.0531, device='cuda:0', grad_fn=) cls_loss: tensor(0.1345, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0082, device='cuda:0', grad_fn=) cls_loss: tensor(0.0255, device='cuda:0', grad_fn=) cls_loss: tensor(0.0069, device='cuda:0', grad_fn=) cls_loss: tensor(0.0732, device='cuda:0', grad_fn=) cls_loss: tensor(0.0136, device='cuda:0', grad_fn=) cls_loss: tensor(0.0061, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0106, device='cuda:0', grad_fn=) cls_loss: tensor(0.0285, device='cuda:0', grad_fn=) cls_loss: tensor(0.0055, device='cuda:0', grad_fn=) cls_loss: tensor(0.1169, device='cuda:0', grad_fn=) cls_loss: tensor(0.0609, device='cuda:0', grad_fn=) cls_loss: tensor(0.0274, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(0.1519, device='cuda:0', grad_fn=) cls_loss: tensor(0.0088, device='cuda:0', grad_fn=) cls_loss: tensor(0.0849, device='cuda:0', grad_fn=) cls_loss: tensor(0.0081, device='cuda:0', grad_fn=) cls_loss: tensor(0.0061, device='cuda:0', grad_fn=) cls_loss: tensor(0.0070, device='cuda:0', grad_fn=) cls_loss: tensor(0.1946, device='cuda:0', grad_fn=) cls_loss: tensor(0.0067, device='cuda:0', grad_fn=) cls_loss: tensor(0.0204, device='cuda:0', grad_fn=) cls_loss: tensor(0.0102, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0265, device='cuda:0', grad_fn=) cls_loss: tensor(0.0211, device='cuda:0', grad_fn=) cls_loss: tensor(0.0507, device='cuda:0', grad_fn=) cls_loss: tensor(0.0076, device='cuda:0', grad_fn=) cls_loss: tensor(0.0363, device='cuda:0', grad_fn=) cls_loss: tensor(0.0068, device='cuda:0', grad_fn=) cls_loss: tensor(0.1137, device='cuda:0', grad_fn=) cls_loss: tensor(0.1377, device='cuda:0', grad_fn=) cls_loss: tensor(0.0686, device='cuda:0', grad_fn=) cls_loss: tensor(0.0131, device='cuda:0', grad_fn=) cls_loss: tensor(0.0179, device='cuda:0', grad_fn=) cls_loss: tensor(0.0187, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0375, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(0.2488, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0.0090, device='cuda:0', grad_fn=) cls_loss: tensor(0.0413, device='cuda:0', grad_fn=) cls_loss: tensor(0.0343, device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 18, time 179.42, cls_loss 0.0304 100 cls_loss: tensor(0.0070, device='cuda:0', grad_fn=) cls_loss: tensor(0.0108, device='cuda:0', grad_fn=) cls_loss: tensor(0.0113, device='cuda:0', grad_fn=) cls_loss: tensor(0.0069, device='cuda:0', grad_fn=) cls_loss: tensor(0.0225, device='cuda:0', grad_fn=) cls_loss: tensor(0.0699, device='cuda:0', grad_fn=) cls_loss: tensor(0.0119, device='cuda:0', grad_fn=) cls_loss: tensor(0.0140, device='cuda:0', grad_fn=) cls_loss: tensor(0.0407, device='cuda:0', grad_fn=) cls_loss: tensor(0.0218, device='cuda:0', grad_fn=) cls_loss: tensor(0.0103, device='cuda:0', grad_fn=) cls_loss: tensor(0.1202, device='cuda:0', grad_fn=) cls_loss: tensor(0.1141, device='cuda:0', grad_fn=) cls_loss: tensor(0.0076, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(0.0050, device='cuda:0', grad_fn=) cls_loss: tensor(0.0064, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.1610, device='cuda:0', grad_fn=) cls_loss: tensor(0.1774, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.1218, device='cuda:0', grad_fn=) cls_loss: tensor(0.0429, device='cuda:0', grad_fn=) cls_loss: tensor(0.0246, device='cuda:0', grad_fn=) cls_loss: tensor(0.0209, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.1199, device='cuda:0', grad_fn=) cls_loss: tensor(0.0065, device='cuda:0', grad_fn=) cls_loss: tensor(0.0100, device='cuda:0', grad_fn=) cls_loss: tensor(0.0554, device='cuda:0', grad_fn=) cls_loss: tensor(0.0143, device='cuda:0', grad_fn=) cls_loss: tensor(0.0449, device='cuda:0', grad_fn=) cls_loss: tensor(0.0285, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0069, device='cuda:0', grad_fn=) cls_loss: tensor(0.0087, device='cuda:0', grad_fn=) cls_loss: tensor(0.0723, device='cuda:0', grad_fn=) cls_loss: tensor(0.0178, device='cuda:0', grad_fn=) cls_loss: tensor(0.0059, device='cuda:0', grad_fn=) cls_loss: tensor(0.1635, device='cuda:0', grad_fn=) cls_loss: tensor(0.0165, device='cuda:0', grad_fn=) cls_loss: tensor(0.0187, device='cuda:0', grad_fn=) cls_loss: tensor(0.0114, device='cuda:0', grad_fn=) cls_loss: tensor(0.0132, device='cuda:0', grad_fn=) cls_loss: tensor(0.0279, device='cuda:0', grad_fn=) cls_loss: tensor(0.0980, device='cuda:0', grad_fn=) cls_loss: tensor(0.1425, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0124, device='cuda:0', grad_fn=) cls_loss: tensor(0.0419, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(0.0409, device='cuda:0', grad_fn=) cls_loss: tensor(0.0586, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0656, device='cuda:0', grad_fn=) cls_loss: tensor(0.1072, device='cuda:0', grad_fn=) cls_loss: tensor(0.0107, device='cuda:0', grad_fn=) cls_loss: tensor(0.0566, device='cuda:0', grad_fn=) cls_loss: tensor(0.0929, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0069, device='cuda:0', grad_fn=) cls_loss: tensor(0.0183, device='cuda:0', grad_fn=) cls_loss: tensor(0.0563, device='cuda:0', grad_fn=) cls_loss: tensor(0.0367, device='cuda:0', grad_fn=) cls_loss: tensor(0.0082, device='cuda:0', grad_fn=) cls_loss: tensor(0.0167, device='cuda:0', grad_fn=) cls_loss: tensor(0.0109, device='cuda:0', grad_fn=) cls_loss: tensor(0.0061, device='cuda:0', grad_fn=) cls_loss: tensor(0.1519, device='cuda:0', grad_fn=) cls_loss: tensor(0.0333, device='cuda:0', grad_fn=) cls_loss: tensor(0.0303, device='cuda:0', grad_fn=) cls_loss: tensor(0.0753, device='cuda:0', grad_fn=) cls_loss: tensor(0.0127, device='cuda:0', grad_fn=) cls_loss: tensor(0.0142, device='cuda:0', grad_fn=) cls_loss: tensor(0.0158, device='cuda:0', grad_fn=) cls_loss: tensor(0.0904, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0088, device='cuda:0', grad_fn=) cls_loss: tensor(0.0058, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0742, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0730, device='cuda:0', grad_fn=) cls_loss: tensor(0.0077, device='cuda:0', grad_fn=) cls_loss: tensor(0.0057, device='cuda:0', grad_fn=) cls_loss: tensor(0.1861, device='cuda:0', grad_fn=) cls_loss: tensor(0.0236, device='cuda:0', grad_fn=) cls_loss: tensor(0.1829, device='cuda:0', grad_fn=) cls_loss: tensor(0.0156, device='cuda:0', grad_fn=) cls_loss: tensor(0.0175, device='cuda:0', grad_fn=) cls_loss: tensor(0.0146, device='cuda:0', grad_fn=) cls_loss: tensor(0.0112, device='cuda:0', grad_fn=) cls_loss: tensor(0.1086, device='cuda:0', grad_fn=) cls_loss: tensor(0.0062, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 19, time 179.24, cls_loss 0.0376 100 cls_loss: tensor(0.0070, device='cuda:0', grad_fn=) cls_loss: tensor(0.0093, device='cuda:0', grad_fn=) cls_loss: tensor(0.0449, device='cuda:0', grad_fn=) cls_loss: tensor(0.0437, device='cuda:0', grad_fn=) cls_loss: tensor(0.0214, device='cuda:0', grad_fn=) cls_loss: tensor(0.0055, device='cuda:0', grad_fn=) cls_loss: tensor(0.0230, device='cuda:0', grad_fn=) cls_loss: tensor(0.0164, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0114, device='cuda:0', grad_fn=) cls_loss: tensor(0.0245, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0742, device='cuda:0', grad_fn=) cls_loss: tensor(0.0201, device='cuda:0', grad_fn=) cls_loss: tensor(0.0078, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(0.1284, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(0.0058, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0328, device='cuda:0', grad_fn=) cls_loss: tensor(0.0086, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(0.0440, device='cuda:0', grad_fn=) cls_loss: tensor(0.0115, device='cuda:0', grad_fn=) cls_loss: tensor(0.0199, device='cuda:0', grad_fn=) cls_loss: tensor(0.0581, device='cuda:0', grad_fn=) cls_loss: tensor(0.0182, device='cuda:0', grad_fn=) cls_loss: tensor(0.0452, device='cuda:0', grad_fn=) cls_loss: tensor(0.0055, device='cuda:0', grad_fn=) cls_loss: tensor(0.0243, device='cuda:0', grad_fn=) cls_loss: tensor(0.0051, device='cuda:0', grad_fn=) cls_loss: tensor(0.0260, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.1909, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0560, device='cuda:0', grad_fn=) cls_loss: tensor(0.0208, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0.0363, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0140, device='cuda:0', grad_fn=) cls_loss: tensor(0.0060, device='cuda:0', grad_fn=) cls_loss: tensor(0.0790, device='cuda:0', grad_fn=) cls_loss: tensor(0.0262, device='cuda:0', grad_fn=) cls_loss: tensor(0.0225, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(0.0087, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0169, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0329, device='cuda:0', grad_fn=) cls_loss: tensor(0.0111, device='cuda:0', grad_fn=) cls_loss: tensor(0.0572, device='cuda:0', grad_fn=) cls_loss: tensor(0.0221, device='cuda:0', grad_fn=) cls_loss: tensor(0.0089, device='cuda:0', grad_fn=) cls_loss: tensor(0.0424, device='cuda:0', grad_fn=) cls_loss: tensor(0.0824, device='cuda:0', grad_fn=) cls_loss: tensor(0.0055, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0123, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.2619, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0066, device='cuda:0', grad_fn=) cls_loss: tensor(0.0230, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(0.0886, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0.1031, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0270, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0103, device='cuda:0', grad_fn=) cls_loss: tensor(0.1846, device='cuda:0', grad_fn=) cls_loss: tensor(0.1015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0330, device='cuda:0', grad_fn=) cls_loss: tensor(0.0691, device='cuda:0', grad_fn=) cls_loss: tensor(0.0589, device='cuda:0', grad_fn=) cls_loss: tensor(0.0078, device='cuda:0', grad_fn=) cls_loss: tensor(0.0334, device='cuda:0', grad_fn=) cls_loss: tensor(0.0166, device='cuda:0', grad_fn=) cls_loss: tensor(0.0071, device='cuda:0', grad_fn=) cls_loss: tensor(0.0102, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0.0532, device='cuda:0', grad_fn=) cls_loss: tensor(0.0055, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0.0126, device='cuda:0', grad_fn=) cls_loss: tensor(0.0060, device='cuda:0', grad_fn=) cls_loss: tensor(0.1783, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(0.0081, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 20, time 179.34, cls_loss 0.0289 100 cls_loss: tensor(0.0130, device='cuda:0', grad_fn=) cls_loss: tensor(0.0063, device='cuda:0', grad_fn=) cls_loss: tensor(0.0222, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(0.0100, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0766, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0057, device='cuda:0', grad_fn=) cls_loss: tensor(0.0075, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0135, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0273, device='cuda:0', grad_fn=) cls_loss: tensor(0.0331, device='cuda:0', grad_fn=) cls_loss: tensor(0.0143, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0135, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(0.0242, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0197, device='cuda:0', grad_fn=) cls_loss: tensor(0.0072, device='cuda:0', grad_fn=) cls_loss: tensor(0.0359, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0079, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0544, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.1321, device='cuda:0', grad_fn=) cls_loss: tensor(0.0094, device='cuda:0', grad_fn=) cls_loss: tensor(0.1661, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0229, device='cuda:0', grad_fn=) cls_loss: tensor(0.0104, device='cuda:0', grad_fn=) cls_loss: tensor(0.0218, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0183, device='cuda:0', grad_fn=) cls_loss: tensor(0.0239, device='cuda:0', grad_fn=) cls_loss: tensor(0.0315, device='cuda:0', grad_fn=) cls_loss: tensor(0.0073, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0.3797, device='cuda:0', grad_fn=) cls_loss: tensor(0.0280, device='cuda:0', grad_fn=) cls_loss: tensor(0.0156, device='cuda:0', grad_fn=) cls_loss: tensor(0.0723, device='cuda:0', grad_fn=) cls_loss: tensor(0.0107, device='cuda:0', grad_fn=) cls_loss: tensor(0.0092, device='cuda:0', grad_fn=) cls_loss: tensor(0.0174, device='cuda:0', grad_fn=) cls_loss: tensor(0.0142, device='cuda:0', grad_fn=) cls_loss: tensor(0.0249, device='cuda:0', grad_fn=) cls_loss: tensor(0.0136, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(0.1802, device='cuda:0', grad_fn=) cls_loss: tensor(0.0420, device='cuda:0', grad_fn=) cls_loss: tensor(0.0744, device='cuda:0', grad_fn=) cls_loss: tensor(0.0074, device='cuda:0', grad_fn=) cls_loss: tensor(0.0172, device='cuda:0', grad_fn=) cls_loss: tensor(0.0283, device='cuda:0', grad_fn=) cls_loss: tensor(0.0489, device='cuda:0', grad_fn=) cls_loss: tensor(0.1041, device='cuda:0', grad_fn=) cls_loss: tensor(0.0113, device='cuda:0', grad_fn=) cls_loss: tensor(0.0937, device='cuda:0', grad_fn=) cls_loss: tensor(0.0074, device='cuda:0', grad_fn=) cls_loss: tensor(0.1323, device='cuda:0', grad_fn=) cls_loss: tensor(0.0088, device='cuda:0', grad_fn=) cls_loss: tensor(0.0142, device='cuda:0', grad_fn=) cls_loss: tensor(0.0296, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0410, device='cuda:0', grad_fn=) cls_loss: tensor(0.0245, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0085, device='cuda:0', grad_fn=) cls_loss: tensor(0.2029, device='cuda:0', grad_fn=) cls_loss: tensor(0.0177, device='cuda:0', grad_fn=) cls_loss: tensor(0.0240, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0261, device='cuda:0', grad_fn=) cls_loss: tensor(0.0474, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0091, device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(0.0079, device='cuda:0', grad_fn=) cls_loss: tensor(0.0098, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0840, device='cuda:0', grad_fn=) cls_loss: tensor(0.0882, device='cuda:0', grad_fn=) cls_loss: tensor(0.0084, device='cuda:0', grad_fn=) cls_loss: tensor(0.0311, device='cuda:0', grad_fn=) cls_loss: tensor(0.0741, device='cuda:0', grad_fn=) cls_loss: tensor(0.1182, device='cuda:0', grad_fn=) cls_loss: tensor(0.0069, device='cuda:0', grad_fn=) cls_loss: tensor(0.0570, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 21, time 179.59, cls_loss 0.0318 100 cls_loss: tensor(0.0655, device='cuda:0', grad_fn=) cls_loss: tensor(0.0832, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(0.0087, device='cuda:0', grad_fn=) cls_loss: tensor(0.0406, device='cuda:0', grad_fn=) cls_loss: tensor(0.0068, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(0.0073, device='cuda:0', grad_fn=) cls_loss: tensor(0.0931, device='cuda:0', grad_fn=) cls_loss: tensor(0.0458, device='cuda:0', grad_fn=) cls_loss: tensor(0.0050, device='cuda:0', grad_fn=) cls_loss: tensor(0.0352, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.1425, device='cuda:0', grad_fn=) cls_loss: tensor(0.0116, device='cuda:0', grad_fn=) cls_loss: tensor(0.0055, device='cuda:0', grad_fn=) cls_loss: tensor(0.0121, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(0.0284, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0171, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0.0199, device='cuda:0', grad_fn=) cls_loss: tensor(0.0111, device='cuda:0', grad_fn=) cls_loss: tensor(0.2571, device='cuda:0', grad_fn=) cls_loss: tensor(0.0087, device='cuda:0', grad_fn=) cls_loss: tensor(0.0418, device='cuda:0', grad_fn=) cls_loss: tensor(0.0056, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0110, device='cuda:0', grad_fn=) cls_loss: tensor(0.0310, device='cuda:0', grad_fn=) cls_loss: tensor(0.0410, device='cuda:0', grad_fn=) cls_loss: tensor(0.0074, device='cuda:0', grad_fn=) cls_loss: tensor(0.0176, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0129, device='cuda:0', grad_fn=) cls_loss: tensor(0.0480, device='cuda:0', grad_fn=) cls_loss: tensor(0.1130, device='cuda:0', grad_fn=) cls_loss: tensor(0.0206, device='cuda:0', grad_fn=) cls_loss: tensor(0.0124, device='cuda:0', grad_fn=) cls_loss: tensor(0.0064, device='cuda:0', grad_fn=) cls_loss: tensor(0.0117, device='cuda:0', grad_fn=) cls_loss: tensor(0.0255, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0278, device='cuda:0', grad_fn=) cls_loss: tensor(0.0173, device='cuda:0', grad_fn=) cls_loss: tensor(0.0226, device='cuda:0', grad_fn=) cls_loss: tensor(0.0875, device='cuda:0', grad_fn=) cls_loss: tensor(0.2008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0126, device='cuda:0', grad_fn=) cls_loss: tensor(0.0220, device='cuda:0', grad_fn=) cls_loss: tensor(0.0232, device='cuda:0', grad_fn=) cls_loss: tensor(0.0117, device='cuda:0', grad_fn=) cls_loss: tensor(0.0061, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0198, device='cuda:0', grad_fn=) cls_loss: tensor(0.1142, device='cuda:0', grad_fn=) cls_loss: tensor(0.0058, device='cuda:0', grad_fn=) cls_loss: tensor(0.0061, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(0.0258, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0171, device='cuda:0', grad_fn=) cls_loss: tensor(0.0108, device='cuda:0', grad_fn=) cls_loss: tensor(0.0128, device='cuda:0', grad_fn=) cls_loss: tensor(0.0253, device='cuda:0', grad_fn=) cls_loss: tensor(0.0066, device='cuda:0', grad_fn=) cls_loss: tensor(0.0154, device='cuda:0', grad_fn=) cls_loss: tensor(0.0219, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0055, device='cuda:0', grad_fn=) cls_loss: tensor(0.0063, device='cuda:0', grad_fn=) cls_loss: tensor(0.0208, device='cuda:0', grad_fn=) cls_loss: tensor(0.0325, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(0.0100, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0080, device='cuda:0', grad_fn=) cls_loss: tensor(0.0115, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.1717, device='cuda:0', grad_fn=) cls_loss: tensor(0.0156, device='cuda:0', grad_fn=) cls_loss: tensor(0.0119, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0717, device='cuda:0', grad_fn=) cls_loss: tensor(0.0044, device='cuda:0', grad_fn=) cls_loss: tensor(0.0173, device='cuda:0', grad_fn=) cls_loss: tensor(0.0160, device='cuda:0', grad_fn=) cls_loss: tensor(0.0129, device='cuda:0', grad_fn=) cls_loss: tensor(0.0150, device='cuda:0', grad_fn=) cls_loss: tensor(0.0619, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 22, time 179.67, cls_loss 0.0259 100 cls_loss: tensor(0.1163, device='cuda:0', grad_fn=) cls_loss: tensor(0.0053, device='cuda:0', grad_fn=) cls_loss: tensor(0.0152, device='cuda:0', grad_fn=) cls_loss: tensor(0.0562, device='cuda:0', grad_fn=) cls_loss: tensor(0.0131, device='cuda:0', grad_fn=) cls_loss: tensor(0.0329, device='cuda:0', grad_fn=) cls_loss: tensor(0.0367, device='cuda:0', grad_fn=) cls_loss: tensor(0.0585, device='cuda:0', grad_fn=) cls_loss: tensor(0.0371, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(0.0156, device='cuda:0', grad_fn=) cls_loss: tensor(0.0104, device='cuda:0', grad_fn=) cls_loss: tensor(0.0530, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0726, device='cuda:0', grad_fn=) cls_loss: tensor(0.0074, device='cuda:0', grad_fn=) cls_loss: tensor(0.0207, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0856, device='cuda:0', grad_fn=) cls_loss: tensor(0.0740, device='cuda:0', grad_fn=) cls_loss: tensor(0.0530, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0085, device='cuda:0', grad_fn=) cls_loss: tensor(0.0166, device='cuda:0', grad_fn=) cls_loss: tensor(0.0090, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0058, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(0.0465, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0061, device='cuda:0', grad_fn=) cls_loss: tensor(0.0075, device='cuda:0', grad_fn=) cls_loss: tensor(0.0461, device='cuda:0', grad_fn=) cls_loss: tensor(0.0394, device='cuda:0', grad_fn=) cls_loss: tensor(0.0806, device='cuda:0', grad_fn=) cls_loss: tensor(0.0061, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0204, device='cuda:0', grad_fn=) cls_loss: tensor(0.0501, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0214, device='cuda:0', grad_fn=) cls_loss: tensor(0.0068, device='cuda:0', grad_fn=) cls_loss: tensor(0.0664, device='cuda:0', grad_fn=) cls_loss: tensor(0.0108, device='cuda:0', grad_fn=) cls_loss: tensor(0.0999, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0060, device='cuda:0', grad_fn=) cls_loss: tensor(0.0074, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(0.0095, device='cuda:0', grad_fn=) cls_loss: tensor(0.0087, device='cuda:0', grad_fn=) cls_loss: tensor(0.0090, device='cuda:0', grad_fn=) cls_loss: tensor(0.0182, device='cuda:0', grad_fn=) cls_loss: tensor(0.0127, device='cuda:0', grad_fn=) cls_loss: tensor(0.0061, device='cuda:0', grad_fn=) cls_loss: tensor(0.0359, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.1654, device='cuda:0', grad_fn=) cls_loss: tensor(0.0071, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0367, device='cuda:0', grad_fn=) cls_loss: tensor(0.0106, device='cuda:0', grad_fn=) cls_loss: tensor(0.0059, device='cuda:0', grad_fn=) cls_loss: tensor(0.0076, device='cuda:0', grad_fn=) cls_loss: tensor(0.0139, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(0.0313, device='cuda:0', grad_fn=) cls_loss: tensor(0.0074, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(0.0087, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0243, device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(0.0390, device='cuda:0', grad_fn=) cls_loss: tensor(0.0132, device='cuda:0', grad_fn=) cls_loss: tensor(0.0099, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0600, device='cuda:0', grad_fn=) cls_loss: tensor(0.0465, device='cuda:0', grad_fn=) cls_loss: tensor(0.0153, device='cuda:0', grad_fn=) cls_loss: tensor(0.0092, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0382, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0828, device='cuda:0', grad_fn=) 0.0001 changing lr ---------------------saving model at epoch 23---------------------------------------------------- epoch 23, time 179.59, cls_loss 0.0215 100 cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0105, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(0.0775, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0266, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0395, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0.0458, device='cuda:0', grad_fn=) cls_loss: tensor(0.0528, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0379, device='cuda:0', grad_fn=) cls_loss: tensor(0.0067, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(5.7273e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0708, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0060, device='cuda:0', grad_fn=) cls_loss: tensor(0.0077, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0037, device='cuda:0', grad_fn=) cls_loss: tensor(0.0366, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0251, device='cuda:0', grad_fn=) cls_loss: tensor(0.0065, device='cuda:0', grad_fn=) cls_loss: tensor(0.1686, device='cuda:0', grad_fn=) cls_loss: tensor(0.0061, device='cuda:0', grad_fn=) cls_loss: tensor(0.0603, device='cuda:0', grad_fn=) cls_loss: tensor(0.0868, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0056, device='cuda:0', grad_fn=) cls_loss: tensor(0.0385, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0064, device='cuda:0', grad_fn=) cls_loss: tensor(0.0688, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0066, device='cuda:0', grad_fn=) cls_loss: tensor(0.0178, device='cuda:0', grad_fn=) cls_loss: tensor(0.0065, device='cuda:0', grad_fn=) cls_loss: tensor(0.0100, device='cuda:0', grad_fn=) cls_loss: tensor(0.0341, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0096, device='cuda:0', grad_fn=) cls_loss: tensor(0.0313, device='cuda:0', grad_fn=) cls_loss: tensor(0.0446, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0323, device='cuda:0', grad_fn=) cls_loss: tensor(0.0234, device='cuda:0', grad_fn=) cls_loss: tensor(0.0769, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0284, device='cuda:0', grad_fn=) cls_loss: tensor(0.0680, device='cuda:0', grad_fn=) cls_loss: tensor(0.0204, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0272, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0619, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0312, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0154, device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(0.0105, device='cuda:0', grad_fn=) cls_loss: tensor(0.0060, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0305, device='cuda:0', grad_fn=) cls_loss: tensor(0.0142, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0093, device='cuda:0', grad_fn=) cls_loss: tensor(0.0086, device='cuda:0', grad_fn=) cls_loss: tensor(0.0061, device='cuda:0', grad_fn=) cls_loss: tensor(0.0409, device='cuda:0', grad_fn=) cls_loss: tensor(0.1724, device='cuda:0', grad_fn=) cls_loss: tensor(0.0370, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0062, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(0.0058, device='cuda:0', grad_fn=) cls_loss: tensor(0.0153, device='cuda:0', grad_fn=) cls_loss: tensor(0.0773, device='cuda:0', grad_fn=) cls_loss: tensor(0.0168, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(0.0276, device='cuda:0', grad_fn=) cls_loss: tensor(0.0493, device='cuda:0', grad_fn=) cls_loss: tensor(0.0464, device='cuda:0', grad_fn=) cls_loss: tensor(0.0552, device='cuda:0', grad_fn=) cls_loss: tensor(0.0163, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 24, time 179.65, cls_loss 0.0217 100 cls_loss: tensor(0.2044, device='cuda:0', grad_fn=) cls_loss: tensor(0.1273, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0301, device='cuda:0', grad_fn=) cls_loss: tensor(0.0214, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0085, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(0.0539, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.1722, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0117, device='cuda:0', grad_fn=) cls_loss: tensor(0.0603, device='cuda:0', grad_fn=) cls_loss: tensor(0.0264, device='cuda:0', grad_fn=) cls_loss: tensor(0.0206, device='cuda:0', grad_fn=) cls_loss: tensor(0.0096, device='cuda:0', grad_fn=) cls_loss: tensor(0.0325, device='cuda:0', grad_fn=) cls_loss: tensor(0.0176, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(0.0066, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(0.0077, device='cuda:0', grad_fn=) cls_loss: tensor(0.0144, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0225, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0084, device='cuda:0', grad_fn=) cls_loss: tensor(0.0086, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0175, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0113, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(0.0073, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0150, device='cuda:0', grad_fn=) cls_loss: tensor(0.0096, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(0.0091, device='cuda:0', grad_fn=) cls_loss: tensor(0.0105, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0577, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0358, device='cuda:0', grad_fn=) cls_loss: tensor(0.0135, device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(0.1736, device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0272, device='cuda:0', grad_fn=) cls_loss: tensor(0.0174, device='cuda:0', grad_fn=) cls_loss: tensor(0.0489, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.1104, device='cuda:0', grad_fn=) cls_loss: tensor(0.0213, device='cuda:0', grad_fn=) cls_loss: tensor(0.0595, device='cuda:0', grad_fn=) cls_loss: tensor(0.0209, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0802, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0976, device='cuda:0', grad_fn=) cls_loss: tensor(0.0075, device='cuda:0', grad_fn=) cls_loss: tensor(0.0072, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(0.0080, device='cuda:0', grad_fn=) cls_loss: tensor(0.0194, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0037, device='cuda:0', grad_fn=) cls_loss: tensor(0.0279, device='cuda:0', grad_fn=) cls_loss: tensor(0.0838, device='cuda:0', grad_fn=) cls_loss: tensor(0.0321, device='cuda:0', grad_fn=) cls_loss: tensor(0.0062, device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(0.0075, device='cuda:0', grad_fn=) cls_loss: tensor(0.0220, device='cuda:0', grad_fn=) cls_loss: tensor(0.0053, device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(0.0057, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0159, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0309, device='cuda:0', grad_fn=) cls_loss: tensor(0.0339, device='cuda:0', grad_fn=) cls_loss: tensor(0.0051, device='cuda:0', grad_fn=) cls_loss: tensor(0.0284, device='cuda:0', grad_fn=) cls_loss: tensor(0.0214, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 25, time 179.39, cls_loss 0.0219 100 cls_loss: tensor(0.0235, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0068, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(0.0058, device='cuda:0', grad_fn=) cls_loss: tensor(0.0071, device='cuda:0', grad_fn=) cls_loss: tensor(0.0684, device='cuda:0', grad_fn=) cls_loss: tensor(0.0066, device='cuda:0', grad_fn=) cls_loss: tensor(0.0198, device='cuda:0', grad_fn=) cls_loss: tensor(0.0360, device='cuda:0', grad_fn=) cls_loss: tensor(0.0430, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0362, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(0.1085, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0078, device='cuda:0', grad_fn=) cls_loss: tensor(0.0132, device='cuda:0', grad_fn=) cls_loss: tensor(0.0659, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0072, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0118, device='cuda:0', grad_fn=) cls_loss: tensor(0.0065, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0037, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0194, device='cuda:0', grad_fn=) cls_loss: tensor(0.0279, device='cuda:0', grad_fn=) cls_loss: tensor(0.0698, device='cuda:0', grad_fn=) cls_loss: tensor(0.0120, device='cuda:0', grad_fn=) cls_loss: tensor(0.0095, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0069, device='cuda:0', grad_fn=) cls_loss: tensor(0.0339, device='cuda:0', grad_fn=) cls_loss: tensor(0.0420, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0307, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(0.0175, device='cuda:0', grad_fn=) cls_loss: tensor(0.0090, device='cuda:0', grad_fn=) cls_loss: tensor(0.0110, device='cuda:0', grad_fn=) cls_loss: tensor(0.0613, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0144, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(0.0274, device='cuda:0', grad_fn=) cls_loss: tensor(0.0120, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0212, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0058, device='cuda:0', grad_fn=) cls_loss: tensor(0.0094, device='cuda:0', grad_fn=) cls_loss: tensor(0.0737, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0101, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(0.0153, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.2524, device='cuda:0', grad_fn=) cls_loss: tensor(0.0146, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0599, device='cuda:0', grad_fn=) cls_loss: tensor(0.0129, device='cuda:0', grad_fn=) cls_loss: tensor(0.0156, device='cuda:0', grad_fn=) cls_loss: tensor(0.0885, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0.0223, device='cuda:0', grad_fn=) cls_loss: tensor(0.0299, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(0.0853, device='cuda:0', grad_fn=) cls_loss: tensor(0.0220, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(0.0434, device='cuda:0', grad_fn=) cls_loss: tensor(0.0371, device='cuda:0', grad_fn=) cls_loss: tensor(0.0077, device='cuda:0', grad_fn=) cls_loss: tensor(0.0160, device='cuda:0', grad_fn=) cls_loss: tensor(0.0521, device='cuda:0', grad_fn=) cls_loss: tensor(0.0190, device='cuda:0', grad_fn=) cls_loss: tensor(0.0057, device='cuda:0', grad_fn=) cls_loss: tensor(0.0118, device='cuda:0', grad_fn=) cls_loss: tensor(0.0050, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0433, device='cuda:0', grad_fn=) cls_loss: tensor(0.0050, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(0.0203, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 26, time 179.46, cls_loss 0.0198 100 cls_loss: tensor(0.1287, device='cuda:0', grad_fn=) cls_loss: tensor(0.0171, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(0.0083, device='cuda:0', grad_fn=) cls_loss: tensor(0.0252, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0265, device='cuda:0', grad_fn=) cls_loss: tensor(0.0215, device='cuda:0', grad_fn=) cls_loss: tensor(0.0139, device='cuda:0', grad_fn=) cls_loss: tensor(0.0081, device='cuda:0', grad_fn=) cls_loss: tensor(0.1329, device='cuda:0', grad_fn=) cls_loss: tensor(0.0194, device='cuda:0', grad_fn=) cls_loss: tensor(0.0306, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0179, device='cuda:0', grad_fn=) cls_loss: tensor(0.1044, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0146, device='cuda:0', grad_fn=) cls_loss: tensor(0.0088, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0370, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0094, device='cuda:0', grad_fn=) cls_loss: tensor(0.0093, device='cuda:0', grad_fn=) cls_loss: tensor(0.0044, device='cuda:0', grad_fn=) cls_loss: tensor(0.0835, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0138, device='cuda:0', grad_fn=) cls_loss: tensor(0.0549, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0234, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0350, device='cuda:0', grad_fn=) cls_loss: tensor(0.0677, device='cuda:0', grad_fn=) cls_loss: tensor(0.0147, device='cuda:0', grad_fn=) cls_loss: tensor(0.0160, device='cuda:0', grad_fn=) cls_loss: tensor(0.0539, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(0.0252, device='cuda:0', grad_fn=) cls_loss: tensor(0.0316, device='cuda:0', grad_fn=) cls_loss: tensor(0.0264, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0097, device='cuda:0', grad_fn=) cls_loss: tensor(0.0132, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0082, device='cuda:0', grad_fn=) cls_loss: tensor(0.0165, device='cuda:0', grad_fn=) cls_loss: tensor(0.0074, device='cuda:0', grad_fn=) cls_loss: tensor(0.0120, device='cuda:0', grad_fn=) cls_loss: tensor(0.1248, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0143, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0525, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0062, device='cuda:0', grad_fn=) cls_loss: tensor(0.0076, device='cuda:0', grad_fn=) cls_loss: tensor(0.0119, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.1334, device='cuda:0', grad_fn=) cls_loss: tensor(0.0055, device='cuda:0', grad_fn=) cls_loss: tensor(0.1504, device='cuda:0', grad_fn=) cls_loss: tensor(0.0283, device='cuda:0', grad_fn=) cls_loss: tensor(0.0066, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0086, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0129, device='cuda:0', grad_fn=) cls_loss: tensor(0.0974, device='cuda:0', grad_fn=) cls_loss: tensor(0.0108, device='cuda:0', grad_fn=) cls_loss: tensor(0.0103, device='cuda:0', grad_fn=) cls_loss: tensor(0.0326, device='cuda:0', grad_fn=) cls_loss: tensor(0.0719, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0330, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0317, device='cuda:0', grad_fn=) cls_loss: tensor(0.0058, device='cuda:0', grad_fn=) cls_loss: tensor(0.0531, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(0.0153, device='cuda:0', grad_fn=) cls_loss: tensor(0.0111, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 27, time 178.94, cls_loss 0.0217 100 cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0067, device='cuda:0', grad_fn=) cls_loss: tensor(0.0064, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0083, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0632, device='cuda:0', grad_fn=) cls_loss: tensor(0.0245, device='cuda:0', grad_fn=) cls_loss: tensor(0.0225, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0180, device='cuda:0', grad_fn=) cls_loss: tensor(0.1084, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0984, device='cuda:0', grad_fn=) cls_loss: tensor(0.0584, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0066, device='cuda:0', grad_fn=) cls_loss: tensor(0.0610, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0.0131, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0.0079, device='cuda:0', grad_fn=) cls_loss: tensor(0.0150, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0279, device='cuda:0', grad_fn=) cls_loss: tensor(0.0148, device='cuda:0', grad_fn=) cls_loss: tensor(0.1277, device='cuda:0', grad_fn=) cls_loss: tensor(0.0208, device='cuda:0', grad_fn=) cls_loss: tensor(0.0162, device='cuda:0', grad_fn=) cls_loss: tensor(0.0275, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(0.0067, device='cuda:0', grad_fn=) cls_loss: tensor(0.0376, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0076, device='cuda:0', grad_fn=) cls_loss: tensor(0.0176, device='cuda:0', grad_fn=) cls_loss: tensor(0.0080, device='cuda:0', grad_fn=) cls_loss: tensor(0.0208, device='cuda:0', grad_fn=) cls_loss: tensor(0.0772, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0169, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0090, device='cuda:0', grad_fn=) cls_loss: tensor(0.0279, device='cuda:0', grad_fn=) cls_loss: tensor(0.0121, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0117, device='cuda:0', grad_fn=) cls_loss: tensor(0.1231, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0220, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0067, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0103, device='cuda:0', grad_fn=) cls_loss: tensor(0.1259, device='cuda:0', grad_fn=) cls_loss: tensor(0.0057, device='cuda:0', grad_fn=) cls_loss: tensor(0.0196, device='cuda:0', grad_fn=) cls_loss: tensor(0.0707, device='cuda:0', grad_fn=) cls_loss: tensor(0.0051, device='cuda:0', grad_fn=) cls_loss: tensor(0.0200, device='cuda:0', grad_fn=) cls_loss: tensor(0.0109, device='cuda:0', grad_fn=) cls_loss: tensor(0.0601, device='cuda:0', grad_fn=) cls_loss: tensor(0.0780, device='cuda:0', grad_fn=) cls_loss: tensor(0.0228, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0084, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0102, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0248, device='cuda:0', grad_fn=) cls_loss: tensor(0.0107, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(0.0652, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(0.0229, device='cuda:0', grad_fn=) cls_loss: tensor(0.0078, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(0.0130, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0596, device='cuda:0', grad_fn=) cls_loss: tensor(0.0053, device='cuda:0', grad_fn=) cls_loss: tensor(0.0335, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 28, time 178.77, cls_loss 0.0193 100 cls_loss: tensor(0.0109, device='cuda:0', grad_fn=) cls_loss: tensor(0.0080, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.1153, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0081, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0102, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0158, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0092, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0410, device='cuda:0', grad_fn=) cls_loss: tensor(0.0072, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0127, device='cuda:0', grad_fn=) cls_loss: tensor(0.0185, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0488, device='cuda:0', grad_fn=) cls_loss: tensor(0.0355, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0059, device='cuda:0', grad_fn=) cls_loss: tensor(0.1195, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0448, device='cuda:0', grad_fn=) cls_loss: tensor(0.0121, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0304, device='cuda:0', grad_fn=) cls_loss: tensor(0.0037, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0051, device='cuda:0', grad_fn=) cls_loss: tensor(0.0058, device='cuda:0', grad_fn=) cls_loss: tensor(0.0109, device='cuda:0', grad_fn=) cls_loss: tensor(0.0451, device='cuda:0', grad_fn=) cls_loss: tensor(0.0480, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0077, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0138, device='cuda:0', grad_fn=) cls_loss: tensor(0.0104, device='cuda:0', grad_fn=) cls_loss: tensor(0.0198, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0119, device='cuda:0', grad_fn=) cls_loss: tensor(0.1077, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0504, device='cuda:0', grad_fn=) cls_loss: tensor(0.0122, device='cuda:0', grad_fn=) cls_loss: tensor(0.0403, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(0.0588, device='cuda:0', grad_fn=) cls_loss: tensor(0.0521, device='cuda:0', grad_fn=) cls_loss: tensor(0.0261, device='cuda:0', grad_fn=) cls_loss: tensor(0.0230, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0111, device='cuda:0', grad_fn=) cls_loss: tensor(0.1447, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0616, device='cuda:0', grad_fn=) cls_loss: tensor(0.0588, device='cuda:0', grad_fn=) cls_loss: tensor(0.0063, device='cuda:0', grad_fn=) cls_loss: tensor(0.0854, device='cuda:0', grad_fn=) cls_loss: tensor(0.0699, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0172, device='cuda:0', grad_fn=) cls_loss: tensor(0.0329, device='cuda:0', grad_fn=) cls_loss: tensor(0.1276, device='cuda:0', grad_fn=) cls_loss: tensor(0.0051, device='cuda:0', grad_fn=) cls_loss: tensor(0.0118, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0140, device='cuda:0', grad_fn=) cls_loss: tensor(0.0993, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0611, device='cuda:0', grad_fn=) cls_loss: tensor(0.1195, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0051, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0044, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0146, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(0.0182, device='cuda:0', grad_fn=) cls_loss: tensor(0.0102, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 29, time 178.99, cls_loss 0.0216 100 cls_loss: tensor(0.0459, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0057, device='cuda:0', grad_fn=) cls_loss: tensor(0.0166, device='cuda:0', grad_fn=) cls_loss: tensor(0.0056, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(0.0080, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.1007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0076, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0233, device='cuda:0', grad_fn=) cls_loss: tensor(0.0080, device='cuda:0', grad_fn=) cls_loss: tensor(0.0457, device='cuda:0', grad_fn=) cls_loss: tensor(0.0208, device='cuda:0', grad_fn=) cls_loss: tensor(0.0096, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0112, device='cuda:0', grad_fn=) cls_loss: tensor(0.0134, device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(0.0044, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0098, device='cuda:0', grad_fn=) cls_loss: tensor(0.0097, device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(0.0089, device='cuda:0', grad_fn=) cls_loss: tensor(0.0189, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(0.0131, device='cuda:0', grad_fn=) cls_loss: tensor(0.0400, device='cuda:0', grad_fn=) cls_loss: tensor(0.0079, device='cuda:0', grad_fn=) cls_loss: tensor(0.0189, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0103, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0201, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0224, device='cuda:0', grad_fn=) cls_loss: tensor(0.0069, device='cuda:0', grad_fn=) cls_loss: tensor(0.0066, device='cuda:0', grad_fn=) cls_loss: tensor(0.0065, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0107, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0174, device='cuda:0', grad_fn=) cls_loss: tensor(0.0249, device='cuda:0', grad_fn=) cls_loss: tensor(0.0151, device='cuda:0', grad_fn=) cls_loss: tensor(0.0074, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0123, device='cuda:0', grad_fn=) cls_loss: tensor(0.2611, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0380, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(0.0273, device='cuda:0', grad_fn=) cls_loss: tensor(0.0148, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0093, device='cuda:0', grad_fn=) cls_loss: tensor(0.0084, device='cuda:0', grad_fn=) cls_loss: tensor(0.0284, device='cuda:0', grad_fn=) cls_loss: tensor(0.0075, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0172, device='cuda:0', grad_fn=) cls_loss: tensor(0.0093, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0044, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0991, device='cuda:0', grad_fn=) cls_loss: tensor(0.0114, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 30, time 179.12, cls_loss 0.0125 100 cls_loss: tensor(0.0124, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0248, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0210, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(5.8677e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0064, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0082, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0216, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0085, device='cuda:0', grad_fn=) cls_loss: tensor(0.0692, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.1188, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0072, device='cuda:0', grad_fn=) cls_loss: tensor(0.1041, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(0.0067, device='cuda:0', grad_fn=) cls_loss: tensor(0.0579, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(0.0305, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(0.0265, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.1114, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0126, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.1280, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0319, device='cuda:0', grad_fn=) cls_loss: tensor(0.0614, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(0.0161, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0060, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0329, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(0.0079, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(0.0097, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0101, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0072, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0481, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0037, device='cuda:0', grad_fn=) cls_loss: tensor(0.0141, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0165, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(0.0110, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(0.0246, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 31, time 178.96, cls_loss 0.0122 100 cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0314, device='cuda:0', grad_fn=) cls_loss: tensor(0.0534, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0242, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(9.6131e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0072, device='cuda:0', grad_fn=) cls_loss: tensor(0.0114, device='cuda:0', grad_fn=) cls_loss: tensor(0.0233, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(0.0901, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0050, device='cuda:0', grad_fn=) cls_loss: tensor(0.0255, device='cuda:0', grad_fn=) cls_loss: tensor(0.0079, device='cuda:0', grad_fn=) cls_loss: tensor(0.0196, device='cuda:0', grad_fn=) cls_loss: tensor(0.1137, device='cuda:0', grad_fn=) cls_loss: tensor(0.0187, device='cuda:0', grad_fn=) cls_loss: tensor(0.0124, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(0.0068, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0936, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0077, device='cuda:0', grad_fn=) cls_loss: tensor(0.0067, device='cuda:0', grad_fn=) cls_loss: tensor(0.0214, device='cuda:0', grad_fn=) cls_loss: tensor(0.0861, device='cuda:0', grad_fn=) cls_loss: tensor(0.0121, device='cuda:0', grad_fn=) cls_loss: tensor(0.0213, device='cuda:0', grad_fn=) cls_loss: tensor(0.0065, device='cuda:0', grad_fn=) cls_loss: tensor(0.0876, device='cuda:0', grad_fn=) cls_loss: tensor(0.0037, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(0.0063, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(0.0110, device='cuda:0', grad_fn=) cls_loss: tensor(0.0399, device='cuda:0', grad_fn=) cls_loss: tensor(0.1067, device='cuda:0', grad_fn=) cls_loss: tensor(0.0150, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0566, device='cuda:0', grad_fn=) cls_loss: tensor(0.1016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0269, device='cuda:0', grad_fn=) cls_loss: tensor(0.0187, device='cuda:0', grad_fn=) cls_loss: tensor(0.0484, device='cuda:0', grad_fn=) cls_loss: tensor(0.0118, device='cuda:0', grad_fn=) cls_loss: tensor(0.0094, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0065, device='cuda:0', grad_fn=) cls_loss: tensor(0.0223, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0094, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0143, device='cuda:0', grad_fn=) cls_loss: tensor(0.0361, device='cuda:0', grad_fn=) cls_loss: tensor(0.0121, device='cuda:0', grad_fn=) cls_loss: tensor(0.0104, device='cuda:0', grad_fn=) cls_loss: tensor(0.0057, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0872, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0095, device='cuda:0', grad_fn=) cls_loss: tensor(0.0844, device='cuda:0', grad_fn=) cls_loss: tensor(0.0071, device='cuda:0', grad_fn=) cls_loss: tensor(0.0121, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0184, device='cuda:0', grad_fn=) 0.0001 changing lr ---------------------saving model at epoch 32---------------------------------------------------- epoch 32, time 179.24, cls_loss 0.0167 100 cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0407, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0050, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0065, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0112, device='cuda:0', grad_fn=) cls_loss: tensor(0.0231, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(0.0070, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0176, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0372, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0378, device='cuda:0', grad_fn=) cls_loss: tensor(0.0783, device='cuda:0', grad_fn=) cls_loss: tensor(0.0326, device='cuda:0', grad_fn=) cls_loss: tensor(0.0188, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0193, device='cuda:0', grad_fn=) cls_loss: tensor(0.0084, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0336, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0928, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0065, device='cuda:0', grad_fn=) cls_loss: tensor(0.0585, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0.0058, device='cuda:0', grad_fn=) cls_loss: tensor(0.0242, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0077, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0620, device='cuda:0', grad_fn=) cls_loss: tensor(0.0245, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0166, device='cuda:0', grad_fn=) cls_loss: tensor(0.0657, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0116, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0061, device='cuda:0', grad_fn=) cls_loss: tensor(0.0227, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0080, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0253, device='cuda:0', grad_fn=) cls_loss: tensor(0.0057, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0.0145, device='cuda:0', grad_fn=) cls_loss: tensor(0.0224, device='cuda:0', grad_fn=) cls_loss: tensor(0.0096, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(5.4602e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) 0.0001 changing lr ---------------------saving model at epoch 33---------------------------------------------------- epoch 33, time 179.48, cls_loss 0.0097 100 cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0118, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0226, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0166, device='cuda:0', grad_fn=) cls_loss: tensor(0.0862, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0231, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(0.0080, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0120, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0173, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0160, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0141, device='cuda:0', grad_fn=) cls_loss: tensor(0.0120, device='cuda:0', grad_fn=) cls_loss: tensor(0.0219, device='cuda:0', grad_fn=) cls_loss: tensor(0.0058, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0338, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0055, device='cuda:0', grad_fn=) cls_loss: tensor(0.0113, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(0.0358, device='cuda:0', grad_fn=) cls_loss: tensor(0.0063, device='cuda:0', grad_fn=) cls_loss: tensor(0.0743, device='cuda:0', grad_fn=) cls_loss: tensor(0.0076, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0125, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0057, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(5.7831e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.1101, device='cuda:0', grad_fn=) cls_loss: tensor(0.0343, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0083, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0295, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(0.0926, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0076, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0073, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0.0057, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0109, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0216, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0126, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0104, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0601, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) 0.0001 changing lr ---------------------saving model at epoch 34---------------------------------------------------- epoch 34, time 179.46, cls_loss 0.0098 100 cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0058, device='cuda:0', grad_fn=) cls_loss: tensor(0.0055, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0126, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(6.6597e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0234, device='cuda:0', grad_fn=) cls_loss: tensor(0.0175, device='cuda:0', grad_fn=) cls_loss: tensor(0.0074, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0642, device='cuda:0', grad_fn=) cls_loss: tensor(0.0684, device='cuda:0', grad_fn=) cls_loss: tensor(0.0060, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0.0110, device='cuda:0', grad_fn=) cls_loss: tensor(0.0224, device='cuda:0', grad_fn=) cls_loss: tensor(0.0195, device='cuda:0', grad_fn=) cls_loss: tensor(0.0265, device='cuda:0', grad_fn=) cls_loss: tensor(0.0096, device='cuda:0', grad_fn=) cls_loss: tensor(0.0069, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0158, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0639, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(6.2361e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0060, device='cuda:0', grad_fn=) cls_loss: tensor(0.0344, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(9.1821e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0190, device='cuda:0', grad_fn=) cls_loss: tensor(0.0133, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0172, device='cuda:0', grad_fn=) cls_loss: tensor(0.0071, device='cuda:0', grad_fn=) cls_loss: tensor(0.0103, device='cuda:0', grad_fn=) cls_loss: tensor(0.1290, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0089, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0422, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0093, device='cuda:0', grad_fn=) cls_loss: tensor(0.0297, device='cuda:0', grad_fn=) cls_loss: tensor(0.0101, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(7.1429e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0.0037, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0065, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0616, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.2367, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0268, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(0.0118, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 35, time 179.65, cls_loss 0.0115 100 cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(0.0320, device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0221, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0705, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0073, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0133, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0539, device='cuda:0', grad_fn=) cls_loss: tensor(0.0714, device='cuda:0', grad_fn=) cls_loss: tensor(0.0114, device='cuda:0', grad_fn=) cls_loss: tensor(0.0405, device='cuda:0', grad_fn=) cls_loss: tensor(0.0067, device='cuda:0', grad_fn=) cls_loss: tensor(0.0106, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0160, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(0.0726, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0215, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(0.0083, device='cuda:0', grad_fn=) cls_loss: tensor(0.0546, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0202, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0429, device='cuda:0', grad_fn=) cls_loss: tensor(0.0104, device='cuda:0', grad_fn=) cls_loss: tensor(0.0436, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.1062, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0157, device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(0.0689, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(0.0223, device='cuda:0', grad_fn=) cls_loss: tensor(0.0558, device='cuda:0', grad_fn=) cls_loss: tensor(0.0608, device='cuda:0', grad_fn=) cls_loss: tensor(0.0089, device='cuda:0', grad_fn=) cls_loss: tensor(0.0321, device='cuda:0', grad_fn=) cls_loss: tensor(0.0396, device='cuda:0', grad_fn=) cls_loss: tensor(0.0137, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0196, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(0.1447, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0357, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0130, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0126, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0842, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0609, device='cuda:0', grad_fn=) cls_loss: tensor(0.0195, device='cuda:0', grad_fn=) cls_loss: tensor(0.0050, device='cuda:0', grad_fn=) cls_loss: tensor(0.0688, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.1163, device='cuda:0', grad_fn=) cls_loss: tensor(0.0137, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0125, device='cuda:0', grad_fn=) cls_loss: tensor(0.0079, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0069, device='cuda:0', grad_fn=) cls_loss: tensor(0.0365, device='cuda:0', grad_fn=) cls_loss: tensor(0.0321, device='cuda:0', grad_fn=) cls_loss: tensor(0.0108, device='cuda:0', grad_fn=) cls_loss: tensor(0.0652, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 36, time 179.32, cls_loss 0.0192 100 cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0099, device='cuda:0', grad_fn=) cls_loss: tensor(0.0101, device='cuda:0', grad_fn=) cls_loss: tensor(0.0388, device='cuda:0', grad_fn=) cls_loss: tensor(0.0155, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0057, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0176, device='cuda:0', grad_fn=) cls_loss: tensor(0.0122, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(0.0117, device='cuda:0', grad_fn=) cls_loss: tensor(0.2140, device='cuda:0', grad_fn=) cls_loss: tensor(0.0332, device='cuda:0', grad_fn=) cls_loss: tensor(0.0057, device='cuda:0', grad_fn=) cls_loss: tensor(0.0202, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(6.5736e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0037, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0087, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.8394e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0591, device='cuda:0', grad_fn=) cls_loss: tensor(0.0066, device='cuda:0', grad_fn=) cls_loss: tensor(0.0212, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0.1665, device='cuda:0', grad_fn=) cls_loss: tensor(0.0061, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0565, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0077, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0044, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.1096, device='cuda:0', grad_fn=) cls_loss: tensor(0.0089, device='cuda:0', grad_fn=) cls_loss: tensor(0.0057, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0063, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0061, device='cuda:0', grad_fn=) cls_loss: tensor(0.1281, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0166, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0095, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(0.0067, device='cuda:0', grad_fn=) cls_loss: tensor(0.0619, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0251, device='cuda:0', grad_fn=) cls_loss: tensor(0.1199, device='cuda:0', grad_fn=) cls_loss: tensor(0.0063, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(0.0148, device='cuda:0', grad_fn=) cls_loss: tensor(0.0460, device='cuda:0', grad_fn=) cls_loss: tensor(0.0081, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(0.0214, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0087, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0053, device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(0.0943, device='cuda:0', grad_fn=) cls_loss: tensor(0.0334, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0140, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0037, device='cuda:0', grad_fn=) cls_loss: tensor(0.0581, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 37, time 179.63, cls_loss 0.0165 100 cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0113, device='cuda:0', grad_fn=) cls_loss: tensor(0.0181, device='cuda:0', grad_fn=) cls_loss: tensor(0.0095, device='cuda:0', grad_fn=) cls_loss: tensor(0.1373, device='cuda:0', grad_fn=) cls_loss: tensor(0.0199, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(0.0062, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0255, device='cuda:0', grad_fn=) cls_loss: tensor(0.0107, device='cuda:0', grad_fn=) cls_loss: tensor(0.0086, device='cuda:0', grad_fn=) cls_loss: tensor(0.0380, device='cuda:0', grad_fn=) cls_loss: tensor(0.0057, device='cuda:0', grad_fn=) cls_loss: tensor(0.0413, device='cuda:0', grad_fn=) cls_loss: tensor(0.0713, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0274, device='cuda:0', grad_fn=) cls_loss: tensor(0.0097, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0794, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0094, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0220, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(0.0713, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0066, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0529, device='cuda:0', grad_fn=) cls_loss: tensor(0.0109, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(6.2954e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0528, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0068, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0074, device='cuda:0', grad_fn=) cls_loss: tensor(0.0055, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0453, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(0.0074, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0073, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0127, device='cuda:0', grad_fn=) cls_loss: tensor(0.0108, device='cuda:0', grad_fn=) cls_loss: tensor(0.0101, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0075, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0037, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(0.0130, device='cuda:0', grad_fn=) cls_loss: tensor(0.0050, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 38, time 179.50, cls_loss 0.0099 100 cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0073, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(0.0104, device='cuda:0', grad_fn=) cls_loss: tensor(0.0581, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(9.7398e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0051, device='cuda:0', grad_fn=) cls_loss: tensor(0.0128, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0175, device='cuda:0', grad_fn=) cls_loss: tensor(0.0101, device='cuda:0', grad_fn=) cls_loss: tensor(0.0055, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(2.9884e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(9.6917e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(2.0579e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0195, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(4.3571e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0061, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0674, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(9.9961e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0089, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(5.9765e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.1208, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0404, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0058, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0473, device='cuda:0', grad_fn=) cls_loss: tensor(0.0987, device='cuda:0', grad_fn=) cls_loss: tensor(0.0096, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(3.7368e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0065, device='cuda:0', grad_fn=) cls_loss: tensor(0.0180, device='cuda:0', grad_fn=) cls_loss: tensor(0.0053, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0073, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(0.0057, device='cuda:0', grad_fn=) cls_loss: tensor(0.1593, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(0.0130, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0056, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(0.0136, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0854, device='cuda:0', grad_fn=) cls_loss: tensor(0.0063, device='cuda:0', grad_fn=) cls_loss: tensor(0.0142, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 39, time 178.62, cls_loss 0.0099 100 cls_loss: tensor(0.1314, device='cuda:0', grad_fn=) cls_loss: tensor(0.0095, device='cuda:0', grad_fn=) cls_loss: tensor(0.0080, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0116, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0037, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0421, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(6.4593e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0053, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0143, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0056, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0077, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(4.8034e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0094, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0079, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(3.3297e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0108, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0053, device='cuda:0', grad_fn=) cls_loss: tensor(0.0170, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(4.0457e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0065, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(4.2915e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.4548e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 40, time 178.17, cls_loss 0.0038 100 cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0073, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0109, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(0.0256, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.1299, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0117, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0816, device='cuda:0', grad_fn=) cls_loss: tensor(0.1590, device='cuda:0', grad_fn=) cls_loss: tensor(4.7971e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(0.0070, device='cuda:0', grad_fn=) cls_loss: tensor(0.0525, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0060, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(0.0712, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(5.1595e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.6367e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.2572, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0.0231, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0466, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(0.0761, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(0.0044, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0560, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0224, device='cuda:0', grad_fn=) cls_loss: tensor(0.0251, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0101, device='cuda:0', grad_fn=) cls_loss: tensor(0.0200, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0068, device='cuda:0', grad_fn=) cls_loss: tensor(0.0097, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0063, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0285, device='cuda:0', grad_fn=) cls_loss: tensor(0.0158, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0.0207, device='cuda:0', grad_fn=) cls_loss: tensor(9.5095e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 41, time 179.17, cls_loss 0.0132 100 cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0206, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0108, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0076, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0168, device='cuda:0', grad_fn=) cls_loss: tensor(0.0156, device='cuda:0', grad_fn=) cls_loss: tensor(9.2566e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0218, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0056, device='cuda:0', grad_fn=) cls_loss: tensor(0.0430, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0061, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0627, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0264, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0070, device='cuda:0', grad_fn=) cls_loss: tensor(0.0081, device='cuda:0', grad_fn=) cls_loss: tensor(0.0071, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(6.8892e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0139, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(4.0200e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0143, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0.0167, device='cuda:0', grad_fn=) cls_loss: tensor(5.2713e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0053, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.7599e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(5.8599e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.3918e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) 0.0001 changing lr ---------------------saving model at epoch 42---------------------------------------------------- epoch 42, time 179.63, cls_loss 0.0041 100 cls_loss: tensor(7.9356e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.7414e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(5.9776e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(6.6645e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0096, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0526, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0261, device='cuda:0', grad_fn=) cls_loss: tensor(0.0050, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0122, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(4.1418e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(2.8860e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.2007e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(7.7676e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(9.1750e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(6.0305e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0088, device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(0.0074, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0078, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0409, device='cuda:0', grad_fn=) cls_loss: tensor(6.1411e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.7620e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0260, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(7.5098e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0204, device='cuda:0', grad_fn=) cls_loss: tensor(7.2196e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0064, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0491, device='cuda:0', grad_fn=) cls_loss: tensor(6.9924e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0051, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 43, time 179.33, cls_loss 0.0037 100 cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0097, device='cuda:0', grad_fn=) cls_loss: tensor(0.0077, device='cuda:0', grad_fn=) cls_loss: tensor(0.0691, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0099, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(6.8285e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.5703e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0809, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0991, device='cuda:0', grad_fn=) cls_loss: tensor(0.0166, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(4.0106e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0102, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0413, device='cuda:0', grad_fn=) cls_loss: tensor(0.0320, device='cuda:0', grad_fn=) cls_loss: tensor(0.0619, device='cuda:0', grad_fn=) cls_loss: tensor(0.0060, device='cuda:0', grad_fn=) cls_loss: tensor(0.0065, device='cuda:0', grad_fn=) cls_loss: tensor(0.0092, device='cuda:0', grad_fn=) cls_loss: tensor(3.4485e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(0.0057, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0235, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0373, device='cuda:0', grad_fn=) cls_loss: tensor(0.0108, device='cuda:0', grad_fn=) cls_loss: tensor(0.0291, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(8.3249e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(3.2920e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.6244e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0089, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0112, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(6.9745e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.9668e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0136, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0.0102, device='cuda:0', grad_fn=) cls_loss: tensor(0.0102, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0264, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(5.4490e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(8.0355e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0061, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(8.0608e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(9.4481e-05, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 44, time 179.66, cls_loss 0.0074 100 cls_loss: tensor(0.0056, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(8.2042e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.5420e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.4196e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0098, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0096, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(5.1908e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0083, device='cuda:0', grad_fn=) cls_loss: tensor(0.0753, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(0.0044, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0094, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(2.4758e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0078, device='cuda:0', grad_fn=) cls_loss: tensor(0.0104, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0384, device='cuda:0', grad_fn=) cls_loss: tensor(0.0724, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(4.5024e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(7.6022e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0044, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0187, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.2404, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0255, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0172, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0426, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0165, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0833, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0427, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(4.4730e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0415, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 45, time 179.02, cls_loss 0.0087 100 cls_loss: tensor(6.9752e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0167, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0133, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0059, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0088, device='cuda:0', grad_fn=) cls_loss: tensor(0.0292, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0.0088, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0095, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0663, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0278, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0051, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0105, device='cuda:0', grad_fn=) cls_loss: tensor(0.0106, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0084, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0152, device='cuda:0', grad_fn=) cls_loss: tensor(0.0092, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0092, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0107, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0264, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0057, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0259, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(6.0629e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0215, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0063, device='cuda:0', grad_fn=) cls_loss: tensor(0.1030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0118, device='cuda:0', grad_fn=) cls_loss: tensor(3.9775e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(7.1410e-05, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 46, time 179.06, cls_loss 0.0054 100 cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0066, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0063, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0.1833, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0670, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0175, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0076, device='cuda:0', grad_fn=) cls_loss: tensor(0.1470, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0325, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.1883, device='cuda:0', grad_fn=) cls_loss: tensor(0.0099, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0250, device='cuda:0', grad_fn=) cls_loss: tensor(0.0341, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(9.3240e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(6.4403e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0283, device='cuda:0', grad_fn=) cls_loss: tensor(0.0057, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0162, device='cuda:0', grad_fn=) cls_loss: tensor(0.0095, device='cuda:0', grad_fn=) cls_loss: tensor(0.0187, device='cuda:0', grad_fn=) cls_loss: tensor(0.0181, device='cuda:0', grad_fn=) cls_loss: tensor(0.0075, device='cuda:0', grad_fn=) cls_loss: tensor(0.0089, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(0.0320, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.1520, device='cuda:0', grad_fn=) cls_loss: tensor(0.0135, device='cuda:0', grad_fn=) cls_loss: tensor(4.4256e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.3073e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0127, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0146, device='cuda:0', grad_fn=) cls_loss: tensor(8.1223e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(5.6840e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0157, device='cuda:0', grad_fn=) cls_loss: tensor(0.0050, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(5.0519e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0077, device='cuda:0', grad_fn=) cls_loss: tensor(0.1228, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(8.5801e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0055, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0089, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0128, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 47, time 179.17, cls_loss 0.0134 100 cls_loss: tensor(0.0199, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0065, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(9.6101e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0199, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0056, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0212, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0219, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0432, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0614, device='cuda:0', grad_fn=) cls_loss: tensor(1.5151e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(7.9531e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9690e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0.0887, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(3.5059e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.4574e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0061, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0084, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(0.0511, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(2.3253e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0.0153, device='cuda:0', grad_fn=) cls_loss: tensor(1.7002e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(9.0845e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0155, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(0.0239, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0075, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0071, device='cuda:0', grad_fn=) cls_loss: tensor(2.8923e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(9.3181e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0373, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(9.2257e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.3485e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 48, time 179.12, cls_loss 0.0055 100 cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0062, device='cuda:0', grad_fn=) cls_loss: tensor(6.8180e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0571, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(7.0367e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.5617e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0112, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0109, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0056, device='cuda:0', grad_fn=) cls_loss: tensor(9.6150e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0282, device='cuda:0', grad_fn=) cls_loss: tensor(0.0099, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0114, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.2824e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(4.3999e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7563e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0360, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0062, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0296, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(0.0051, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(5.3741e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0496, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(7.9487e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0078, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(0.0292, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.1049, device='cuda:0', grad_fn=) cls_loss: tensor(3.6620e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.3303e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.1610, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0122, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 49, time 179.50, cls_loss 0.0066 100 cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(8.1327e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.8659e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.0627e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(1.1388e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0037, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0391, device='cuda:0', grad_fn=) cls_loss: tensor(0.0652, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(3.3654e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0072, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0588, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.4603e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0274, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0115, device='cuda:0', grad_fn=) cls_loss: tensor(0.1239, device='cuda:0', grad_fn=) cls_loss: tensor(0.0237, device='cuda:0', grad_fn=) cls_loss: tensor(0.0107, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0879, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(7.3303e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0102, device='cuda:0', grad_fn=) cls_loss: tensor(0.0069, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0694, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0.0061, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0083, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0082, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0050, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(3.4869e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.2460e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(3.5796e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 50, time 179.28, cls_loss 0.0065 100 cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(2.6017e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0071, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0417, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(7.6335e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(4.9271e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0044, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0083, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0141, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0193, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(2.0925e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0241, device='cuda:0', grad_fn=) cls_loss: tensor(6.0309e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.7805e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.6267e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0585, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(3.6519e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0044, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0243, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(5.7865e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(9.6519e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0629, device='cuda:0', grad_fn=) cls_loss: tensor(0.0037, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.1761, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0098, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(7.5851e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0580, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(0.0175, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0361, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.9323e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0111, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0386, device='cuda:0', grad_fn=) cls_loss: tensor(0.0112, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 51, time 179.13, cls_loss 0.0070 100 cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0237, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0089, device='cuda:0', grad_fn=) cls_loss: tensor(2.8260e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0404, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0059, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0074, device='cuda:0', grad_fn=) cls_loss: tensor(0.0062, device='cuda:0', grad_fn=) cls_loss: tensor(0.0210, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0962, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(3.7536e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(7.7523e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0219, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0.2668, device='cuda:0', grad_fn=) cls_loss: tensor(0.0481, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(3.4440e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(2.5988e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0159, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0053, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0064, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0100, device='cuda:0', grad_fn=) cls_loss: tensor(9.2100e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0120, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(6.4157e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0055, device='cuda:0', grad_fn=) cls_loss: tensor(0.0059, device='cuda:0', grad_fn=) cls_loss: tensor(0.0392, device='cuda:0', grad_fn=) cls_loss: tensor(0.0333, device='cuda:0', grad_fn=) cls_loss: tensor(0.0123, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0101, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(9.3438e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0283, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(0.0110, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0050, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(0.0069, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0234, device='cuda:0', grad_fn=) cls_loss: tensor(5.5984e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0069, device='cuda:0', grad_fn=) cls_loss: tensor(5.1837e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 52, time 179.50, cls_loss 0.0088 100 cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.1270, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0405, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(7.1518e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0800, device='cuda:0', grad_fn=) cls_loss: tensor(1.4760e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(1.0911e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0153, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0057, device='cuda:0', grad_fn=) cls_loss: tensor(0.0206, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(0.0069, device='cuda:0', grad_fn=) cls_loss: tensor(0.0383, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0160, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0156, device='cuda:0', grad_fn=) cls_loss: tensor(0.0646, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(8.9206e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0068, device='cuda:0', grad_fn=) cls_loss: tensor(0.0293, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(8.7496e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(8.4575e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0201, device='cuda:0', grad_fn=) cls_loss: tensor(0.0115, device='cuda:0', grad_fn=) cls_loss: tensor(1.8943e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0860, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0084, device='cuda:0', grad_fn=) cls_loss: tensor(0.1105, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0059, device='cuda:0', grad_fn=) cls_loss: tensor(0.0172, device='cuda:0', grad_fn=) cls_loss: tensor(0.0110, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0060, device='cuda:0', grad_fn=) cls_loss: tensor(0.0242, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0062, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.1050, device='cuda:0', grad_fn=) cls_loss: tensor(0.0200, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0203, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 53, time 179.20, cls_loss 0.0100 100 cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(0.0107, device='cuda:0', grad_fn=) cls_loss: tensor(0.0037, device='cuda:0', grad_fn=) cls_loss: tensor(0.0420, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0062, device='cuda:0', grad_fn=) cls_loss: tensor(0.0377, device='cuda:0', grad_fn=) cls_loss: tensor(0.0070, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0629, device='cuda:0', grad_fn=) cls_loss: tensor(0.0037, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0061, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0947, device='cuda:0', grad_fn=) cls_loss: tensor(0.0178, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0236, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.9563e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0058, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(7.8242e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0200, device='cuda:0', grad_fn=) cls_loss: tensor(0.0180, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0060, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0063, device='cuda:0', grad_fn=) cls_loss: tensor(0.0073, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.7736e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0107, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(5.1219e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0072, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0059, device='cuda:0', grad_fn=) cls_loss: tensor(0.0119, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(0.0104, device='cuda:0', grad_fn=) cls_loss: tensor(2.6118e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0087, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.0329e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(5.5075e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.8784e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.6962e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.0461e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(9.1176e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(9.6627e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(3.4805e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 54, time 180.20, cls_loss 0.0050 100 cls_loss: tensor(1.5691e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0935, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.8584e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0077, device='cuda:0', grad_fn=) cls_loss: tensor(2.1018e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.0716e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.2393, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0406, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(7.7624e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0060, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(3.2000e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.2927e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6200e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0127, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(3.9052e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0426, device='cuda:0', grad_fn=) cls_loss: tensor(8.7529e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(5.8398e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.7819e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.4357e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0660, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(3.4187e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0061, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(4.1518e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.2240e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0151, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 55, time 178.71, cls_loss 0.0061 100 cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0086, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0071, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(8.7246e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0157, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.0736e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0438, device='cuda:0', grad_fn=) cls_loss: tensor(0.0037, device='cuda:0', grad_fn=) cls_loss: tensor(0.0142, device='cuda:0', grad_fn=) cls_loss: tensor(7.7210e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(9.6191e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.2287, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0112, device='cuda:0', grad_fn=) cls_loss: tensor(7.2718e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0051, device='cuda:0', grad_fn=) cls_loss: tensor(0.0071, device='cuda:0', grad_fn=) cls_loss: tensor(2.6226e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0037, device='cuda:0', grad_fn=) cls_loss: tensor(0.0121, device='cuda:0', grad_fn=) cls_loss: tensor(2.0690e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(0.1112, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(8.3130e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0160, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0138, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0096, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0136, device='cuda:0', grad_fn=) cls_loss: tensor(3.7260e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0121, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(3.7447e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0162, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0082, device='cuda:0', grad_fn=) cls_loss: tensor(3.4202e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0084, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 56, time 179.30, cls_loss 0.0064 100 cls_loss: tensor(4.4063e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(4.6175e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0117, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0215, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(3.1110e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(7.9259e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(7.3984e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.8046e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0124, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(9.9227e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(8.6371e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9298e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.3889e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0124, device='cuda:0', grad_fn=) cls_loss: tensor(6.0797e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5403e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(4.7430e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.3579e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0180, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(7.1004e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(5.6740e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0082, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(6.1397e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0144, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0702, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(7.5191e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.4198e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0287, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0090, device='cuda:0', grad_fn=) cls_loss: tensor(4.6786e-05, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 57, time 179.10, cls_loss 0.0028 100 cls_loss: tensor(3.8136e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9579e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(4.3076e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0466, device='cuda:0', grad_fn=) cls_loss: tensor(0.0132, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(7.3597e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.0347e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0079, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(8.6907e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0090, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0069, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(8.0578e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0281, device='cuda:0', grad_fn=) cls_loss: tensor(3.8378e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0057, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(2.8342e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.2831e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.8456e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(1.9461e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(7.5281e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0141, device='cuda:0', grad_fn=) cls_loss: tensor(6.7890e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0121, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0059, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0504, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0101, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(5.9072e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.3063e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(9.6336e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(8.5257e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(3.9127e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.1160, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.9575e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(6.3702e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(4.8790e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0315, device='cuda:0', grad_fn=) cls_loss: tensor(0.0140, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0051, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(2.4572e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9397e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0071, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(2.2672e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 58, time 179.17, cls_loss 0.0044 100 cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(9.4153e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.7065e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1011e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0654, device='cuda:0', grad_fn=) cls_loss: tensor(3.8501e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(1.5680e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0171, device='cuda:0', grad_fn=) cls_loss: tensor(0.0146, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(0.0543, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0070, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.3246e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(8.6229e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(9.5703e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0327, device='cuda:0', grad_fn=) cls_loss: tensor(0.0303, device='cuda:0', grad_fn=) cls_loss: tensor(8.6594e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.0487e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.1776e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.3298, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0262, device='cuda:0', grad_fn=) cls_loss: tensor(0.0092, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0241, device='cuda:0', grad_fn=) cls_loss: tensor(0.0800, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(6.5140e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.1471, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.1228e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0352, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0217, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0074, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0250, device='cuda:0', grad_fn=) cls_loss: tensor(0.0064, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0154, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(1.5177e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0505, device='cuda:0', grad_fn=) cls_loss: tensor(0.0136, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0201, device='cuda:0', grad_fn=) cls_loss: tensor(0.0058, device='cuda:0', grad_fn=) cls_loss: tensor(0.0398, device='cuda:0', grad_fn=) cls_loss: tensor(8.8289e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 59, time 179.42, cls_loss 0.0115 100 cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0169, device='cuda:0', grad_fn=) cls_loss: tensor(0.0516, device='cuda:0', grad_fn=) cls_loss: tensor(0.0064, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(0.0070, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0077, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0051, device='cuda:0', grad_fn=) cls_loss: tensor(3.9767e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.1183, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(5.6423e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0877, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(3.0786e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0099, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0437, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.1707, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(2.1122e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0091, device='cuda:0', grad_fn=) cls_loss: tensor(4.3552e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.3951, device='cuda:0', grad_fn=) cls_loss: tensor(0.0200, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.1189, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(7.5500e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.2337, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(0.0602, device='cuda:0', grad_fn=) cls_loss: tensor(9.9689e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0200, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(6.8087e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0152, device='cuda:0', grad_fn=) cls_loss: tensor(0.0085, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0.0119, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0062, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0153, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0112, device='cuda:0', grad_fn=) cls_loss: tensor(0.0069, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0118, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0087, device='cuda:0', grad_fn=) cls_loss: tensor(5.9247e-05, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 60, time 179.30, cls_loss 0.0156 100 cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(8.0537e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(8.0235e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0470, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(5.9608e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0177, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(7.9915e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.1556e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.3875e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0169, device='cuda:0', grad_fn=) cls_loss: tensor(0.0053, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.4881e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0061, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0222, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0186, device='cuda:0', grad_fn=) cls_loss: tensor(1.4517e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.2003e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0267, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(4.0617e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.2373e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0080, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.7540e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0092, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0123, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(8.4631e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0099, device='cuda:0', grad_fn=) cls_loss: tensor(9.8638e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.8728e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(8.5875e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0179, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0286, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 61, time 179.43, cls_loss 0.0031 100 cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0080, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0126, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0160, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0086, device='cuda:0', grad_fn=) cls_loss: tensor(3.4258e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0083, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0051, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(5.6431e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(3.4217e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.6678e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.4157e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0617, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0143, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0200, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(3.5360e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(8.3905e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1075e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0044, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(2.8770e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(0.0090, device='cuda:0', grad_fn=) cls_loss: tensor(4.6935e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0075, device='cuda:0', grad_fn=) cls_loss: tensor(5.4460e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.7438e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.2883e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(8.8476e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0353, device='cuda:0', grad_fn=) cls_loss: tensor(9.2059e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.3326e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(7.9188e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(6.1393e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(2.7481e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.2039e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0604, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0152, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 62, time 179.66, cls_loss 0.0034 100 cls_loss: tensor(3.2596e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(9.3725e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.4028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0515, device='cuda:0', grad_fn=) cls_loss: tensor(0.0203, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.1051e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0232, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.2617e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.1493, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.1665, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(1.1854e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0088, device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(0.0194, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0240, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.2705, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0114, device='cuda:0', grad_fn=) cls_loss: tensor(0.0359, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0160, device='cuda:0', grad_fn=) cls_loss: tensor(0.0186, device='cuda:0', grad_fn=) cls_loss: tensor(0.0132, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(9.6884e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0098, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(6.7677e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(6.6426e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(0.0110, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0163, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(5.1610e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0.0104, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0078, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0609, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0126, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(3.1494e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(6.3617e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(4.7106e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0724, device='cuda:0', grad_fn=) cls_loss: tensor(6.9577e-05, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 63, time 179.26, cls_loss 0.0154 100 cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.8403e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0107, device='cuda:0', grad_fn=) cls_loss: tensor(6.2253e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(6.9715e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0306, device='cuda:0', grad_fn=) cls_loss: tensor(0.0051, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(7.5392e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1373e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9871e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.3570e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(1.9770e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.1732e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(6.3669e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.1720e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(6.8214e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(7.4133e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0052, device='cuda:0', grad_fn=) cls_loss: tensor(7.8678e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.5837e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.2980e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(9.2119e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.1067e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.6660e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(1.2841e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0074, device='cuda:0', grad_fn=) cls_loss: tensor(5.9869e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(8.5667e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.8245e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.5387e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0180, device='cuda:0', grad_fn=) cls_loss: tensor(9.4272e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0159, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.2118e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.6624e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 64, time 179.39, cls_loss 0.0016 100 cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(5.0813e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0400, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(4.1936e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(1.2312e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.6924e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.8534e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(9.2223e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.9271e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.1955e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.6046e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(7.2706e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0110, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(5.3268e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.5192e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(2.5969e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(5.5194e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(9.2387e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(8.4378e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.0136e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(8.6810e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(2.2367e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1265e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.0792e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.9150e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0379, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0146, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0156, device='cuda:0', grad_fn=) cls_loss: tensor(0.0085, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0064, device='cuda:0', grad_fn=) cls_loss: tensor(8.0027e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(2.4952e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7694e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(1.4611e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0572, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.4906e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7679e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(8.5868e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(4.1388e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0103, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 65, time 179.12, cls_loss 0.0026 100 cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(0.0082, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.9266e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(7.1246e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.2841e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.7296e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(6.5248e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.8557e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.9372e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(4.9669e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(3.3651e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(1.2115e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(6.2209e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(5.8025e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(3.9004e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(7.6137e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.7891e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.6462e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.9357e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(2.1581e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0047, device='cuda:0', grad_fn=) cls_loss: tensor(3.3908e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0233, device='cuda:0', grad_fn=) cls_loss: tensor(5.2020e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(2.3469e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(1.1198e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.8983e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.2485e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.3993e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2439e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0107, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(3.4194e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(9.9875e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0142, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(5.0984e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.3544e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7345e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.5605e-05, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 66, time 179.32, cls_loss 0.0010 100 cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(8.5831e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0577, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(2.7534e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5928e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5786e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.8764e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6801e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0220, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.0920e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0123, device='cuda:0', grad_fn=) cls_loss: tensor(1.2599e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1147e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(5.4892e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(0.0344, device='cuda:0', grad_fn=) cls_loss: tensor(0.0056, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(8.8230e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0433, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0067, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.2592e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.9965e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.4470e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(1.1090e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(7.5419e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.8782e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0897, device='cuda:0', grad_fn=) cls_loss: tensor(2.6040e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(2.1085e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0515, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.1160, device='cuda:0', grad_fn=) cls_loss: tensor(8.5149e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(2.5865e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0223, device='cuda:0', grad_fn=) cls_loss: tensor(0.0107, device='cuda:0', grad_fn=) cls_loss: tensor(0.0548, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(2.1838e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.4096e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0115, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(8.9630e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0157, device='cuda:0', grad_fn=) cls_loss: tensor(0.0111, device='cuda:0', grad_fn=) cls_loss: tensor(0.0301, device='cuda:0', grad_fn=) cls_loss: tensor(0.0065, device='cuda:0', grad_fn=) cls_loss: tensor(0.0205, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0155, device='cuda:0', grad_fn=) cls_loss: tensor(0.0060, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0373, device='cuda:0', grad_fn=) cls_loss: tensor(6.0722e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(8.7123e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(6.3185e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 67, time 179.10, cls_loss 0.0074 100 cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(6.8329e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.5577e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0300, device='cuda:0', grad_fn=) cls_loss: tensor(1.9684e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0081, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(3.1900e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0163, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0071, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(9.7893e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0928, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(0.0069, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0233, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.8957e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9801e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0378, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(2.5056e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0218, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(9.4950e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.8273e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(2.6114e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0516, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(2.6040e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(3.5353e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0103, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0042, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0669, device='cuda:0', grad_fn=) cls_loss: tensor(1.4596e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(3.6288e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(2.4062e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3130e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.7145e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0292, device='cuda:0', grad_fn=) cls_loss: tensor(5.0936e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0127, device='cuda:0', grad_fn=) cls_loss: tensor(0.0974, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0214, device='cuda:0', grad_fn=) cls_loss: tensor(5.3462e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.1683e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 68, time 179.10, cls_loss 0.0060 100 cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0082, device='cuda:0', grad_fn=) cls_loss: tensor(6.2436e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.1349, device='cuda:0', grad_fn=) cls_loss: tensor(3.3528e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(2.5194e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0323, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.6360e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0371, device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(5.6941e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0166, device='cuda:0', grad_fn=) cls_loss: tensor(7.3429e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(4.2632e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6587e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0126, device='cuda:0', grad_fn=) cls_loss: tensor(9.9536e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0239, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(7.5430e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0066, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.0562e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(1.5013e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(1.5050e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(2.4166e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.4115e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.0090e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(8.3573e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(4.4096e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0519, device='cuda:0', grad_fn=) cls_loss: tensor(5.3838e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0313, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(2.3521e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0064, device='cuda:0', grad_fn=) cls_loss: tensor(0.0114, device='cuda:0', grad_fn=) cls_loss: tensor(0.0261, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0137, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.4752e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(2.5038e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0100, device='cuda:0', grad_fn=) cls_loss: tensor(0.2233, device='cuda:0', grad_fn=) cls_loss: tensor(4.2226e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0076, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(8.2843e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 69, time 179.34, cls_loss 0.0071 100 cls_loss: tensor(0.0203, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0041, device='cuda:0', grad_fn=) cls_loss: tensor(0.0074, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0289, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(8.2165e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0597, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0057, device='cuda:0', grad_fn=) cls_loss: tensor(0.0036, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(1.7554e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.7857e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0571, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(8.9888e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.6091e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(5.8290e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.2435, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(8.9698e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0050, device='cuda:0', grad_fn=) cls_loss: tensor(0.1133, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0287, device='cuda:0', grad_fn=) cls_loss: tensor(6.1717e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0568, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0055, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0259, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0061, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(0.0851, device='cuda:0', grad_fn=) cls_loss: tensor(5.1942e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0412, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0259, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0223, device='cuda:0', grad_fn=) cls_loss: tensor(0.0107, device='cuda:0', grad_fn=) cls_loss: tensor(0.0128, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0045, device='cuda:0', grad_fn=) cls_loss: tensor(2.0362e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0074, device='cuda:0', grad_fn=) cls_loss: tensor(2.1268e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(0.0090, device='cuda:0', grad_fn=) cls_loss: tensor(0.0389, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.1103, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 70, time 179.09, cls_loss 0.0111 100 cls_loss: tensor(8.5853e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0382, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0069, device='cuda:0', grad_fn=) cls_loss: tensor(0.0249, device='cuda:0', grad_fn=) cls_loss: tensor(0.0038, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.5367e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.8287e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.2733e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0074, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0039, device='cuda:0', grad_fn=) cls_loss: tensor(0.0103, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(6.8285e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(6.7525e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0093, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(7.5810e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0098, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(8.5007e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.3564e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0048, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(9.3061e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.1553e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(0.0134, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0508, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.4182e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0057, device='cuda:0', grad_fn=) cls_loss: tensor(0.0143, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(4.0889e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0080, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0092, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0055, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(6.9804e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.4213, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0527, device='cuda:0', grad_fn=) cls_loss: tensor(6.6396e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0109, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0085, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.4110e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0216, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0.1114, device='cuda:0', grad_fn=) cls_loss: tensor(0.0719, device='cuda:0', grad_fn=) cls_loss: tensor(0.0050, device='cuda:0', grad_fn=) cls_loss: tensor(0.0333, device='cuda:0', grad_fn=) cls_loss: tensor(0.0585, device='cuda:0', grad_fn=) cls_loss: tensor(0.0174, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0029, device='cuda:0', grad_fn=) cls_loss: tensor(0.0298, device='cuda:0', grad_fn=) cls_loss: tensor(0.0570, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 71, time 178.65, cls_loss 0.0118 100 cls_loss: tensor(0.0116, device='cuda:0', grad_fn=) cls_loss: tensor(0.0066, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0100, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0743, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.1197, device='cuda:0', grad_fn=) cls_loss: tensor(0.0788, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(4.9964e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0028, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.5483e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.5587e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0104, device='cuda:0', grad_fn=) cls_loss: tensor(0.0142, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0468, device='cuda:0', grad_fn=) cls_loss: tensor(0.0088, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0333, device='cuda:0', grad_fn=) cls_loss: tensor(0.0078, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0054, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0651, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0262, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(6.5561e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0134, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(6.2075e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0.0114, device='cuda:0', grad_fn=) cls_loss: tensor(0.0168, device='cuda:0', grad_fn=) cls_loss: tensor(0.0096, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(3.2187e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0046, device='cuda:0', grad_fn=) cls_loss: tensor(2.0456e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.1285e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0071, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(5.9020e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(7.7624e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0365, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0043, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0107, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.6352e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0289, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0211, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(5.5309e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(2.5094e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 72, time 178.69, cls_loss 0.0074 100 cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0044, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(7.6685e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.7228e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0149, device='cuda:0', grad_fn=) cls_loss: tensor(4.2748e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0077, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0092, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(7.5750e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0660, device='cuda:0', grad_fn=) cls_loss: tensor(0.0177, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0101, device='cuda:0', grad_fn=) cls_loss: tensor(0.0065, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(6.6336e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0095, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0089, device='cuda:0', grad_fn=) cls_loss: tensor(0.0031, device='cuda:0', grad_fn=) cls_loss: tensor(0.0044, device='cuda:0', grad_fn=) cls_loss: tensor(1.2670e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0123, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.8068e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0084, device='cuda:0', grad_fn=) cls_loss: tensor(0.0044, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.6154e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(3.1773e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(7.9140e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0160, device='cuda:0', grad_fn=) cls_loss: tensor(1.9137e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0305, device='cuda:0', grad_fn=) cls_loss: tensor(0.0056, device='cuda:0', grad_fn=) cls_loss: tensor(4.9751e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0213, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(8.0038e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.1502e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9050e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(4.0218e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0277, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0073, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(2.6230e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(3.5603e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.5096e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3682e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) 0.0001 changing lr ---------------------saving model at epoch 73---------------------------------------------------- epoch 73, time 178.81, cls_loss 0.0035 100 cls_loss: tensor(4.3157e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(1.0882e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0030, device='cuda:0', grad_fn=) cls_loss: tensor(7.1153e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(5.9966e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(9.8124e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0088, device='cuda:0', grad_fn=) cls_loss: tensor(5.4523e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.5103e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9754e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.1898e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.9951e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3986e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.1173e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(8.2929e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.4959e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.7854e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.7786e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.5199e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(8.8587e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(4.3552e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(3.8829e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2873e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.0330e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3434e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.0165e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.1340e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.5317e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(6.6005e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.1749e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.3035e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.0834e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.0807e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.9593e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.5478e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.8599e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.2420e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0250, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.2308e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(8.6911e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(4.1191e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.1263, device='cuda:0', grad_fn=) cls_loss: tensor(3.5271e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3790e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) 0.0001 changing lr ---------------------saving model at epoch 74---------------------------------------------------- epoch 74, time 178.84, cls_loss 0.0019 100 cls_loss: tensor(7.1581e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(6.0152e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.8453e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(7.1902e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0022, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0083, device='cuda:0', grad_fn=) cls_loss: tensor(0.0089, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.1688, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.1321e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.8552e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0356, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.9212e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0107, device='cuda:0', grad_fn=) cls_loss: tensor(1.6719e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.2056e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.7425e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.0233e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.4156e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.0030e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2348e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.2066e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0057, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(7.7233e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(9.7170e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0034, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0026, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0032, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(1.7349e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.4612e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(8.0746e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.2224e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0057, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.4794e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.8335e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(2.6915e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(4.9796e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(9.0994e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(3.2306e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0203, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0027, device='cuda:0', grad_fn=) cls_loss: tensor(6.5763e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4707e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.8121e-05, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 75, time 178.30, cls_loss 0.0032 100 cls_loss: tensor(6.8396e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(2.6774e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6142e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0017, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.1567e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.2116e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(5.5570e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(4.6518e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(7.6756e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2863e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2930e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.3958e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.4910e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0025, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.0921e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(5.4169e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.4906e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.6209e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(1.3191e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0119, device='cuda:0', grad_fn=) cls_loss: tensor(8.9765e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.2954e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2320e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.0601e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.4719e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.8967e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.6902e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.4607e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.2255e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0080, device='cuda:0', grad_fn=) cls_loss: tensor(9.5889e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.4399e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.4890e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.7896e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.5008e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.1935e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1560e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.4757e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.7078e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.8392e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(8.4933e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.1781e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0040, device='cuda:0', grad_fn=) cls_loss: tensor(0.0018, device='cuda:0', grad_fn=) cls_loss: tensor(3.7793e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.5028e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(4.8954e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(5.4464e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(2.2594e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9777e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.2701e-06, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 76, time 179.04, cls_loss 0.0006 100 cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0020, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.4213e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(4.5337e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.8274e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.2066e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.0154e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0099, device='cuda:0', grad_fn=) cls_loss: tensor(7.1377e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.7353e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.4145e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.0591e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.0928e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(7.0427e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.1937e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(6.2946e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(3.6214e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(9.6794e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.1765e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.0560e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.3068e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3274e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9856e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.0973e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(5.5768e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.4399e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9003e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(9.8083e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(4.6864e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.0447e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(5.2541e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0014, device='cuda:0', grad_fn=) cls_loss: tensor(2.6893e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(4.1600e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0049, device='cuda:0', grad_fn=) cls_loss: tensor(3.5495e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.2465e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.6578e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(8.2970e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3913e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.3757e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9584e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.4283e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(9.0156e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.7041e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6503e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.2164e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1048e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(2.8100e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(2.0713e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.6643e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.5516e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.1278e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2964e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(6.1553e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.4022e-06, device='cuda:0', grad_fn=) 0.0001 changing lr ---------------------saving model at epoch 77---------------------------------------------------- epoch 77, time 178.92, cls_loss 0.0004 100 cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(4.7900e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6502e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.8056e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.6636e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.9203e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2151e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(4.5743e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.2553e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.2592e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.1921e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.0525e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(9.4917e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5091e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0103, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.4802e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.6310e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.9938e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.1835e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.1701e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(3.1874e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0033, device='cuda:0', grad_fn=) cls_loss: tensor(9.3058e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.8897e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.2262e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.8497e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(9.7416e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0019, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.6617e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.2610e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.9263e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.1232e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(2.4438e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.0554e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.0539e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.1543e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.2343e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.4423e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0035, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(5.2303e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.0827e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7621e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(7.7989e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(8.4192e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9887e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(6.6083e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.8124e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(8.5287e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.4354e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2467e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.9104e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.8121e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.3413e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(5.9426e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.1754e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.6877e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6041e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.2840e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.2256e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.2069e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 78, time 178.94, cls_loss 0.0004 100 cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(3.2835e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.0340e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3471e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2870e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.0537e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.6083e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.0228e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.6194e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.6038e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.9858e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.4567e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(6.7279e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.4382e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.4785e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.1388e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.5290e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.2557e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.8417e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.2469e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.0787e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.1042e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(2.2341e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.6459e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(9.6560e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(8.1211e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7017e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(8.2590e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.4725e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.4755e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.5230e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.6201e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(1.4659e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.1105e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9594e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9232e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.8642e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(1.0528e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.7407e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.9153e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6561e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.2932e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(4.8041e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.3447e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0023, device='cuda:0', grad_fn=) cls_loss: tensor(1.4775e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9521e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.1581e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(9.9748e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.0107e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.9749e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.0087e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(7.7244e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.5449e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.7224e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9009e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.0992e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.3337e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.7305e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.9057e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.6454e-06, device='cuda:0', grad_fn=) 0.0001 changing lr epoch 79, time 178.91, cls_loss 0.0002 100 cls_loss: tensor(7.8112e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.6466e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6928e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.8086e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.3106e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.8523e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.5893e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.7369e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.8456e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3028e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7999e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4987e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2691e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.1340e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3717e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3543e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.9627e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.6976e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.8795e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.7839e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(5.8450e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.2015e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3627e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.6173e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.0186e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5075e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(4.0617e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.6710e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.8323e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(9.5934e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(4.3958e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3626e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3422e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(8.8241e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(8.4527e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.4732e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2405e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2569e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.6444e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(8.2098e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.4000e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.8575e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(6.0651e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.4315e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(5.7384e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0021, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.2627e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(2.3093e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.9705e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.1642e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.5507e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.0080e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1876e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.9052e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.4692e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.4059e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(7.3306e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.4815e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.7680e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.4101e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) 1e-05 changing lr epoch 80, time 178.67, cls_loss 0.0001 100 cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.2595e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0011, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(8.0466e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.5982e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.7538e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2467e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.8872e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.6624e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.3242e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.4105e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.6219e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.1383e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1975e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.7690e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9307e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0125e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.3272e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(7.4286e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.8056e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.1295e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.4050e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.7835e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5095e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.4764e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.0741e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(6.0290e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.4698e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.2836e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2480e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.6557e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.3626e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.8296e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5705e-07, device='cuda:0', grad_fn=) cls_loss: tensor(7.8902e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.7046e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3135e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(8.9668e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.0416e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7496e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1979e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.0224e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.4650e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.4650e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.6200e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6280e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.3623e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.1712e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.2362e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.6801e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(5.5991e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.0199e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.6971e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.7572e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.6744e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1495e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.2498e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.8158e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(8.1580e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.1064e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.6299e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.1383e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.5528e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.1081e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.3860e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9729e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.3341e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.5619e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.0991e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0967e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0878e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(2.6070e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7791e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.8685e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.3576e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3647e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.8131e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.4990e-05, device='cuda:0', grad_fn=) 1e-05 changing lr epoch 81, time 178.92, cls_loss 0.0001 100 cls_loss: tensor(1.3568e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.3031e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.9926e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1012e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.2580e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.8040e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1478e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.5371e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.3346e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.7293e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.0966e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.8300e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7347e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.3106e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.9527e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6846e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.4546e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1176e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.1435e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.5021e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.9316e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.2059e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9098e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.2098e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.2128e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5005e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.5525e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.3074e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.5602e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.1437e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.0707e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.6997e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.5384e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.2123e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.1472e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.1778e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0016, device='cuda:0', grad_fn=) cls_loss: tensor(4.3672e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.2021e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.8566e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.4480e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.5825e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(4.7050e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.4149e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(9.4622e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.1577e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.3602e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0300e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0013, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(6.4176e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0010, device='cuda:0', grad_fn=) cls_loss: tensor(8.3968e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.1126e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.8408e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.6317e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.8110e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4152e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5162e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9790e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.0392e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9579e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.0211e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.6038e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.5854e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.2314e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.7502e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.9761e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(9.3505e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.0207e-06, device='cuda:0', grad_fn=) 1e-05 changing lr epoch 82, time 178.87, cls_loss 0.0001 100 cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.2175e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.2314e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9117e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(4.7535e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.9569e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.9543e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1751e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.9209e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6863e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(5.5224e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7248e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.8949e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.0495e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5321e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.1213e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(9.0051e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.4015e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.6919e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.3284e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(6.4746e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(9.1523e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6479e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.0489e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.8126e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.8432e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9088e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.4199e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(2.0474e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(9.6574e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.6154e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(8.5082e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.4716e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9605e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1910e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.0152e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.1197e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.5246e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.4773e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.7996e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.0111e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.1006e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3437e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(3.6418e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(7.7412e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.2909e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.9626e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.2912e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(6.6750e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2517e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(7.1932e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4566e-05, device='cuda:0', grad_fn=) 1e-05 changing lr epoch 83, time 179.10, cls_loss 0.0001 100 cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.0645e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.4952e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7608e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(6.5349e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9977e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(4.0606e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.1547e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.6263e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9919e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.8023e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.7746e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.3187e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.0978e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.4505e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.8179e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.7417e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.6887e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(7.1060e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0086, device='cuda:0', grad_fn=) cls_loss: tensor(1.4111e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.8943e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.2680e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.2311e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.7540e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.2390e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2918e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.3310e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(2.5153e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0356e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.1836e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.6387e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.5380e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.9444e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.0757e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.6848e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0024, device='cuda:0', grad_fn=) cls_loss: tensor(2.1443e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.9717e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.7291e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.8726e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9765e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.1949e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(7.7497e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.8913e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.0892e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(6.4820e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.5390e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.3285e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.0782e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5322e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.6566e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3463e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.0129e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.3499e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.1782e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.3894e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.8533e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.2605e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.9542e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1902e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1828e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2346e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(5.3402e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.3755e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.0707e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.6954e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) 1e-05 changing lr epoch 84, time 179.33, cls_loss 0.0002 100 cls_loss: tensor(1.1861e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.3899e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.6189e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.8989e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3209e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.7577e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.6315e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.2816e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.2708e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.0835e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.0709e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3135e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.7798e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.5266e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.6060e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.7660e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.8189e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.1910e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.8422e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.0206e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(8.8215e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.9161e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.6587e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.4514e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.0990e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.2691e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.3782e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.4750e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.8060e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.7187e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.4477e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.2787e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.8864e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.7993e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.7676e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.1555e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.2238e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1962e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.8366e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.2266e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.0005e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(6.0674e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0692e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.4506e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4100e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4342e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.4074e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.8981e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.4789e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.7889e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(7.9721e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.6168e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.5635e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.8253e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.1456e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.0511e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(7.1526e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.8563e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.9189e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.8427e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(6.8165e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.6317e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.5609e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.7635e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.4373e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.1733e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.1342e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.6778e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.8727e-05, device='cuda:0', grad_fn=) 1e-05 changing lr epoch 85, time 178.58, cls_loss 0.0001 100 cls_loss: tensor(4.1053e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.7365e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(8.9172e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0289e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5482e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.8236e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.4794e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3767e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.1926e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(4.8619e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.0216e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.9397e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.0175e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.9374e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.5769e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.8552e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5624e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1295e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.0465e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.6905e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3043e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(8.8699e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.2139e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(9.1735e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.8498e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.6205e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.9985e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(4.1835e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.3154e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.4146e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.8325e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.5786e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.2276e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.2629e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.6609e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.7146e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3950e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.7505e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.0755e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.9588e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.4685e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(2.6822e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.1989e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.4501e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.5262e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.0342e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1859e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.9252e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(4.8913e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9190e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.2778e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.6638e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.5852e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.3784e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(5.0459e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.5066e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.0677e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9823e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.3877e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.8852e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.9878e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6241e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.0088e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.4768e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.7936e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.3693e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.0564e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.6508e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(6.6184e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) 1e-05 changing lr epoch 86, time 178.76, cls_loss 0.0001 100 cls_loss: tensor(4.2751e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4275e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9759e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4849e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.4345e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.2805e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(7.4841e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.4922e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7703e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(8.6375e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(7.2435e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.0154e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.1786e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.7083e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.6021e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.3479e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2029e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0684e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.4619e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.5937e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.2209e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.8508e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.0683e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.1833e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.1691e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.2651e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.3646e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(2.7195e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.9529e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(2.7426e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.8768e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.7885e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.6202e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.6151e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.8764e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.4043e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.8430e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7110e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.4479e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1527e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.2795e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.4075e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.2538e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.1174e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.1788e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.2899e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5220e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.5805e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(4.9416e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9210e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.1295e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.7743e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.1494e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.5655e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.2507e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.1690e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.5113e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.7508e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1292e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.6399e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.5190e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.6789e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.4801e-05, device='cuda:0', grad_fn=) 1e-05 changing lr epoch 87, time 178.82, cls_loss 0.0001 100 cls_loss: tensor(2.1681e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.8261e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.3335e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.2528e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.3817e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1983e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.1680e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5552e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.7100e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.3741e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.8762e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4450e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.1244e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.6981e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.5279e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3597e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(8.8934e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.1469e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9376e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2361e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.5193e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.4506e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.4095e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.9504e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6494e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.1010e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.0278e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.5941e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.8943e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.6025e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.7667e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0718e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.9585e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.5025e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.4699e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.6066e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.6797e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.2440e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0338e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3866e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3694e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(7.2546e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.0082e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.6603e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.6108e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.2151e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6692e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.2564e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(5.5991e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(5.3849e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(3.6471e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.3788e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.8899e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3020e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7705e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(8.9407e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.3039e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.1079e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.8855e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0431e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(5.1569e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(3.8557e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.4417e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1588e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4640e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.6306e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.0479e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3594e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3184e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3358e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) 1e-05 changing lr epoch 88, time 178.67, cls_loss 0.0001 100 cls_loss: tensor(0.0015, device='cuda:0', grad_fn=) cls_loss: tensor(6.9834e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.0628e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3381e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.8183e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.7048e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.7742e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(5.2243e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.5426e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.7878e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.5300e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6157e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.6364e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.0510e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.1866e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9642e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.0814e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.8994e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.5337e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.2043e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.7431e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.3553e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9445e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.3108e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.0986e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.9630e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.9873e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.1262e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9185e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.0836e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.4250e-07, device='cuda:0', grad_fn=) cls_loss: tensor(5.8040e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5568e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(7.6368e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.0220e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.4650e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2603e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.5746e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2960e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.6065e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.6734e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.8019e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.8727e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(7.2546e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5071e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.3680e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.8348e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.5842e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3916e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.8012e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(7.3761e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.3505e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.5688e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.2202e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.2229e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.7290e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.8068e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.1007e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9521e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.3146e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.8392e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2517e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2986e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.8775e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.6131e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.2692e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.2282e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.1360e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1697e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(2.7344e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.9524e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.1269e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.4624e-05, device='cuda:0', grad_fn=) 1e-05 changing lr epoch 89, time 178.88, cls_loss 0.0001 100 cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(6.6638e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.7912e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(4.9956e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.1047e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(2.7604e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.4193e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.6626e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.8023e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7940e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.7956e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.2990e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3225e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5453e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1780e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.8194e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.9975e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(3.8110e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.6439e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.4703e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.2783e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.5785e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0792e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.8370e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.8795e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.5949e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.0275e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3868e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.8173e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(7.0445e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.6745e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.6411e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.4012e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2524e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.9222e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9858e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.2346e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2832e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.2329e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.1704e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.7770e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4823e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.0419e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.4764e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0941e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.9593e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.6209e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.8977e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.7684e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.0408e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.7660e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.6475e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.0518e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.2680e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(7.8667e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(7.8611e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.8869e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3868e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2318e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.2692e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.7193e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0859e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.5267e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1459e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.4537e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1479e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(8.8092e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.3288e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.9494e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.8148e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.1336e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.0484e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5434e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0012, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.5257e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.8871e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(8.3819e-07, device='cuda:0', grad_fn=) 1e-05 changing lr epoch 90, time 179.28, cls_loss 0.0001 100 cls_loss: tensor(6.8072e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.4317e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(6.6161e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.8008e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.2023e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.7407e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(9.3505e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.2569e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.2414e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.7104e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.4789e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.8763e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.7283e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(5.8986e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(9.6615e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.0649e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9128e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.1914e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.3463e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.0261e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.8152e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.4611e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1038e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(9.8161e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.7886e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.2807e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.3824e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.5677e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.9545e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9622e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2113e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9856e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3528e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(6.4228e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.9556e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1057e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.8659e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7367e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.2585e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(3.7178e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(2.7418e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.6876e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.0668e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3975e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.8302e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.9169e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.6309e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(7.6536e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.3334e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5214e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.4394e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6525e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(6.9402e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3448e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3887e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.6745e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3382e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.2234e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1614e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.0249e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.1265e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.1978e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.5256e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(8.4564e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.3246e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(6.7875e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.9935e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.6194e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.7437e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.3830e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9328e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.1910e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.2121e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.2527e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.9898e-06, device='cuda:0', grad_fn=) 1e-05 changing lr epoch 91, time 179.30, cls_loss 0.0001 100 cls_loss: tensor(4.7684e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.0936e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.1010e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.2435e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5475e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.1676e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.3299e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(8.5063e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.8498e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.4338e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.8413e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.3937e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2054e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.4927e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1146e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.7805e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.3788e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.3678e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2836e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7472e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6450e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.6661e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.5086e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.2473e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0986e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(6.9372e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.8110e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.6571e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9486e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.0395e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.6525e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.2875e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.4499e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.1033e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.4024e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7299e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.5457e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4901e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.9353e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.1396e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(8.0951e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(8.3786e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.7369e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0007, device='cuda:0', grad_fn=) cls_loss: tensor(3.4258e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.1898e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.8585e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.9643e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.7015e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.9595e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.6806e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9045e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.0075e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(7.4320e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.8157e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.6132e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.7869e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9158e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.0308e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.4500e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.0817e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.3049e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.3348e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.0634e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.8788e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.5070e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.6343e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2826e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9965e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(7.4089e-05, device='cuda:0', grad_fn=) 1e-05 changing lr epoch 92, time 179.17, cls_loss 0.0001 100 cls_loss: tensor(8.4154e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0009, device='cuda:0', grad_fn=) cls_loss: tensor(1.1027e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.8662e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.7583e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.0052e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.0639e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.3171e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2538e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2975e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.0117e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.9024e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.0853e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.4627e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.0062e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(6.3404e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.6913e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.5460e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2573e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.4736e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.3670e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.6637e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.4399e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.2415e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.1441e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.5092e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9357e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.8836e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(7.6856e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9047e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3384e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.0646e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.9204e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.0992e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.7049e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.3795e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.5936e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.6112e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.2986e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7329e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(6.7715e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.7044e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7621e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.3375e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9943e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9504e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3597e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(7.1231e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2349e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.2539e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.9435e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.0962e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.2655e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9078e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.6571e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0878e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(8.4937e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.2971e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.7208e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.3484e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.4792e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.4108e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3370e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.4339e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4503e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.7759e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9616e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.7817e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(7.3496e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.8328e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.2585e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2888e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.0009e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.9072e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.6954e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1720e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.0812e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) 1e-05 changing lr epoch 93, time 179.46, cls_loss 0.0001 100 cls_loss: tensor(5.9307e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(6.9290e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.0713e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.1548e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(7.4260e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9813e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.6693e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0008, device='cuda:0', grad_fn=) cls_loss: tensor(2.0657e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.0035e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.7880e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.2251e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.2995e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.6801e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.8323e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(8.8934e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1833e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.8168e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.5935e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.3840e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(7.4469e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.7439e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.1745e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5034e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.0878e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.1180e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.4570e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.7423e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.6131e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.5932e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(6.9145e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2024e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.2081e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(7.7959e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.2884e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2666e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.1122e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.0495e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9955e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.7699e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(4.4212e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.2493e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.0494e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.1802e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.6520e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.7098e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.7799e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.8768e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.5128e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.1780e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1125e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.1372e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(8.0355e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.1732e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.6955e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.4573e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(2.4799e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(8.3633e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.1988e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.3371e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5270e-05, device='cuda:0', grad_fn=) 1e-05 changing lr epoch 94, time 179.61, cls_loss 0.0001 100 cls_loss: tensor(6.8359e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.7097e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.2873e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.4448e-07, device='cuda:0', grad_fn=) cls_loss: tensor(9.1456e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.1642e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.5512e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3802e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1399e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.7129e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.1713e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(7.8876e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.5744e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3628e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2960e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.8266e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.1114e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.4334e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7509e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5757e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6941e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.4421e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.2989e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.5133e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.3858e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(4.3269e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.9792e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6212e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.9850e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.0910e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(8.2478e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.0701e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.2791e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.1402e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.4214e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(8.8505e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.6657e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.4995e-07, device='cuda:0', grad_fn=) cls_loss: tensor(4.2766e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.8231e-07, device='cuda:0', grad_fn=) cls_loss: tensor(3.2298e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.9804e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(1.3411e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.2426e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.3798e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4238e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4421e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.8986e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.4913e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.0508e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.2489e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.1886e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.7170e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.8949e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5065e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.2791e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(3.2689e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6297e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6882e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.4815e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.1752e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.9423e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.1234e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1202e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.4622e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.0456e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1590e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.9482e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(9.6746e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3586e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.5337e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.0466e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.4887e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.2797e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(4.1038e-05, device='cuda:0', grad_fn=) 1e-05 changing lr epoch 95, time 179.34, cls_loss 0.0001 100 cls_loss: tensor(1.4622e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.9332e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.5528e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.5995e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.6082e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3839e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.1845e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.7486e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3256e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.9725e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1014e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(4.5799e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.4445e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.6238e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.6528e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(9.4894e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(5.0247e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.2096e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.2873e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9700e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9041e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.6619e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(9.7524e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7638e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.6426e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.1064e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(6.6403e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.6834e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2025e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.4149e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.0511e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.9169e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.9607e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.5714e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.6544e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.1579e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.4380e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.7017e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9730e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.3055e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4529e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.7839e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7865e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.0502e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3411e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.6190e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.9961e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.6021e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.7742e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.6473e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7109e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.8476e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.7800e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.5943e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9723e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1890e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(8.1733e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.5704e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.0261e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.0973e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.5406e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.6003e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.6836e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7303e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.6003e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.8741e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.0259e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.5474e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.7037e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.5273e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3597e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.7187e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.2965e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.2264e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.3944e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.2709e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.6043e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.9164e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.0843e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2312e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.1456e-06, device='cuda:0', grad_fn=) 1e-05 changing lr epoch 96, time 179.23, cls_loss 0.0001 100 cls_loss: tensor(6.0860e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.9057e-07, device='cuda:0', grad_fn=) cls_loss: tensor(8.8289e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.2859e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.2923e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.8109e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.6880e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.2297e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.9320e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.0005e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.5841e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.1820e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6859e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.8467e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3189e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.3146e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3945e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.0431e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.0152e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.0181e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.6156e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2288e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.3815e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.5189e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.0883e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.6033e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.0729e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.5461e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.3442e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.2990e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.5899e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1681e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(7.3873e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.8342e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.5565e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.0310e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(8.5231e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(6.4652e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.1281e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.3994e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.0870e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4223e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.0203e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1312e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1747e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3143e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2836e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.7940e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(9.3605e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.7904e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.1691e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3954e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.8934e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.3029e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.1254e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.4250e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.2966e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.8740e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.9781e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.2314e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(6.7014e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.6201e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.6622e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.3768e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6302e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(4.7162e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.5520e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.8107e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.5337e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.0109e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.0325e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.0908e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.4937e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.3875e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.8871e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.6385e-06, device='cuda:0', grad_fn=) cls_loss: tensor(7.0129e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.5788e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.8959e-05, device='cuda:0', grad_fn=) 1e-05 changing lr epoch 97, time 179.40, cls_loss 0.0001 100 cls_loss: tensor(1.2740e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.1946e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1730e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(4.6570e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(7.7181e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.8679e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.0617e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.4770e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.1458e-06, device='cuda:0', grad_fn=) cls_loss: tensor(6.3542e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(5.3648e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.7428e-07, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.1456e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.1197e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.4980e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.4537e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.7993e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9841e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9347e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.8007e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9737e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(5.3234e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(9.3020e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.9759e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.4957e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.2277e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.1101e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.6725e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.2029e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(3.1408e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.0915e-06, device='cuda:0', grad_fn=) cls_loss: tensor(8.3152e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3015e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.4754e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.0717e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.0027e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.4521e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.7582e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.8651e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6599e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.4553e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(8.7354e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0005, device='cuda:0', grad_fn=) cls_loss: tensor(1.0729e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.3437e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0006, device='cuda:0', grad_fn=) cls_loss: tensor(1.0718e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.9935e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.8889e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(2.2002e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.6051e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(2.0143e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.4090e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.2003e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.4128e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2960e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3291e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2919e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.8487e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.8450e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(7.0114e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3346e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.5996e-07, device='cuda:0', grad_fn=) cls_loss: tensor(2.4173e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6477e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.8215e-05, device='cuda:0', grad_fn=) 1e-05 changing lr epoch 98, time 178.99, cls_loss 0.0001 100 cls_loss: tensor(2.8454e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.8163e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.0148e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.6323e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.4434e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.7084e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3139e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.6933e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.7285e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.9691e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(5.7481e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(9.9327e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.2101e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.7725e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5790e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.3285e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.2521e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(7.4401e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0004, device='cuda:0', grad_fn=) cls_loss: tensor(2.4814e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.1585e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.3664e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.4827e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.2882e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2331e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.0397e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.5615e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.7588e-06, device='cuda:0', grad_fn=) cls_loss: tensor(1.2532e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.1668e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.5867e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3757e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(2.8845e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.7531e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.0407e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.2089e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.9614e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.3600e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.0355e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.2105e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.8290e-05, device='cuda:0', grad_fn=) cls_loss: tensor(8.2295e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.2852e-06, device='cuda:0', grad_fn=) cls_loss: tensor(2.2460e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6131e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.9279e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(4.3768e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.5188e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(1.3027e-05, device='cuda:0', grad_fn=) cls_loss: tensor(6.8545e-07, device='cuda:0', grad_fn=) cls_loss: tensor(1.9424e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.3724e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.7579e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0002, device='cuda:0', grad_fn=) cls_loss: tensor(3.2987e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.3209e-06, device='cuda:0', grad_fn=) cls_loss: tensor(5.1491e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.5646e-05, device='cuda:0', grad_fn=) cls_loss: tensor(7.0408e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(3.1404e-06, device='cuda:0', grad_fn=) cls_loss: tensor(3.2317e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.0502e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(1.1098e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.6855e-05, device='cuda:0', grad_fn=) cls_loss: tensor(5.9046e-06, device='cuda:0', grad_fn=) cls_loss: tensor(4.5791e-05, device='cuda:0', grad_fn=) cls_loss: tensor(9.4797e-05, device='cuda:0', grad_fn=) cls_loss: tensor(4.0319e-05, device='cuda:0', grad_fn=) cls_loss: tensor(2.7966e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.1367e-06, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(0.0003, device='cuda:0', grad_fn=) cls_loss: tensor(1.4432e-05, device='cuda:0', grad_fn=) cls_loss: tensor(0.0001, device='cuda:0', grad_fn=) cls_loss: tensor(5.2132e-05, device='cuda:0', grad_fn=) cls_loss: tensor(1.6056e-06, device='cuda:0', grad_fn=) cls_loss: tensor(9.1378e-05, device='cuda:0', grad_fn=) cls_loss: tensor(3.0231e-05, device='cuda:0', grad_fn=) 1e-05 changing lr epoch 99, time 179.35, cls_loss 0.0001 ---------------------saving last model at epoch 99---------------------------------------------------- /home/yuqian_fu {'gpu': '0', 'svroot': '/data/work-gcp-europe-west4-a/yuqian_fu/datasets/SingleSourceDG/saved-digit/CA_multiple_14fa_all_ep100_lr1e-4_lr_schedulerStep0.8_bs32_lamCa_1_lamRe_1_cls1_adt2_EW2_100_rmTrue_rnTrue_str3', 'svpath': '/data/work-gcp-europe-west4-a/yuqian_fu/datasets/SingleSourceDG/saved-digit/CA_multiple_14fa_all_ep100_lr1e-4_lr_schedulerStep0.8_bs32_lamCa_1_lamRe_1_cls1_adt2_EW2_100_rmTrue_rnTrue_str3/14factor_best.csv', 'channels': 3, 'factor_num': 14, 'stride': 3, 'epoch': 'best', 'eval_mapping': True} loading weight of best Using downloaded and verified file: /home/yuqian_fu/.pytorch/SVHN/test_32x32.mat mnist svhn ... usps Avg w/o do (original x) 98.76 28.284419 ... 80.568012 51.575641 [1 rows x 6 columns]