pretraining fmri mae start: 2026-02-06 22:36:27 cwd: /admin/home/connor/fmri-fm sha: d20519e5856f827bd427b02eb9f01a851fc41e98, status: has uncommitted changes, branch: dev/clane9 config: name: patch_size/patch8/pretrain notes: patch_size ablations patch8 (patch_size=8 mask_patch_size=16) output_dir: experiments/patch_size/output/patch_size/patch8/pretrain input_space: flat patch_size: 8 num_frames: 16 t_patch_size: 4 mask_ratio: 0.9 pred_mask_ratio: null masking: tube masking_kwargs: {} mask_patch_size: 16 model: mae_vit_base model_kwargs: decoding: attn pos_embed: sep target_norm: null pca_norm_nc: 2 t_pred_stride: 2 no_decode_pos: true mask_drop_scale: false pred_edge_pad: 0 gauss_sigma: null class_token: true reg_tokens: 0 no_embed_class: true head_init_scale: 0.0 decoder_depth: 4 drop_path_rate: 0.0 datasets: hcp-train: type: wds url: /data/fmri-datasets/pretrain/hcpya-all.${input_space}.wds/hcpya-all-${input_space}-{00000..01799}.tar clipping: random clipping_kwargs: oversample: 4.0 shuffle: true buffer_size: 2000 samples_per_epoch: 200000 hcp-train-subset: type: arrow root: s3://medarc/fmri-datasets/eval/hcpya-clips.${input_space}.arrow/validation split_range: - 0 - 2000 shuffle: false hcp-val: type: arrow root: s3://medarc/fmri-datasets/eval/hcpya-clips.${input_space}.arrow/test split_range: - 0 - 2000 shuffle: false train_dataset: hcp-train eval_datasets: - hcp-train-subset - hcp-val val_dataset: null clip_vmax: 3.0 normalize: frame tr_scale: null crop_scale: null crop_aspect: null gray_jitter: null num_workers: 16 epochs: 100 batch_size: 32 accum_iter: 1 base_lr: 0.001 min_lr: 0.0 warmup_epochs: 5 weight_decay: 0.05 betas: - 0.9 - 0.95 clip_grad: 1.0 amp: true amp_dtype: float16 ckpt: null resume: true auto_resume: true start_epoch: 0 max_checkpoints: 0 checkpoint_period: null plot_period: 1 device: cuda presend_cuda: false seed: 6500 debug: false wandb: true wandb_entity: null wandb_project: fMRI-foundation-model rank: 0 world_size: 1 gpu: 0 distributed: true dist_backend: nccl in_chans: 1 img_size: - 224 - 560 train transform: Compose( ToTensor() TemporalCenterCrop(num_frames=16) Normalize(mode='frame') Clip(vmax=3.0) FlatUnmask((224, 560)) ) val transform: Compose( ToTensor() TemporalCenterCrop(num_frames=16) Normalize(mode='frame') Clip(vmax=3.0) FlatUnmask((224, 560)) ) mask generator: TubeMasking( mask_ratio=0.9 (patchify): Patchify2D((224, 560), (16, 16), in_chans=1) ) loading dataset: hcp-train type: wds url: /data/fmri-datasets/pretrain/hcpya-all.${input_space}.wds/hcpya-all-${input_space}-{00000..01799}.tar clipping: random clipping_kwargs: oversample: 4.0 shuffle: true buffer_size: 2000 samples_per_epoch: 200000 loading dataset: hcp-train-subset type: arrow root: s3://medarc/fmri-datasets/eval/hcpya-clips.${input_space}.arrow/validation split_range: - 0 - 2000 shuffle: false split indices: [899, 472, 767, 116, 1265, 1852, 300, 1335, 361, 1560] loading dataset: hcp-val type: arrow root: s3://medarc/fmri-datasets/eval/hcpya-clips.${input_space}.arrow/test split_range: - 0 - 2000 shuffle: false split indices: [1075, 1189, 738, 1350, 965, 1964, 1367, 1183, 1619, 1407] model: MaskedAutoencoderViT( decoding=attn, t_pred_stride=2, pred_edge_pad=0, no_decode_pos=True (encoder): MaskedEncoder( class_token=True, reg_tokens=0, no_embed_class=True, mask_drop_scale=False (patchify): Patchify3D((16, 224, 560), (4, 8, 8), in_chans=1) (patch_embed): Linear(in_features=256, out_features=768, bias=True) (pos_embed): SeparablePosEmbed(768, (4, 28, 70)) (blocks): ModuleList( (0-11): 12 x Block( (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True) (attn): Attention( num_heads=12 (q): Linear(in_features=768, out_features=768, bias=True) (k): Linear(in_features=768, out_features=768, bias=True) (v): Linear(in_features=768, out_features=768, bias=True) (proj): Linear(in_features=768, out_features=768, bias=True) ) (drop_path1): Identity() (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True) (mlp): Mlp( (fc1): Linear(in_features=768, out_features=3072, bias=True) (act): GELU(approximate='none') (fc2): Linear(in_features=3072, out_features=768, bias=True) ) (drop_path2): Identity() ) ) (norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True) ) (pred_patchify): StridedPatchify3D((16, 224, 560), (2, 8, 8), in_chans=1, t_stride=2) (decoder): MaskedDecoder( cross_decode=False, class_token=True, no_embed_class=True (pos_embed): SeparablePosEmbed(512, (4, 28, 70)) (proj): Linear(in_features=768, out_features=512, bias=True) (blocks): ModuleList( (0-3): 4 x Block( (norm1): LayerNorm((512,), eps=1e-06, elementwise_affine=True) (attn): Attention( num_heads=16 (q): Linear(in_features=512, out_features=512, bias=True) (k): Linear(in_features=512, out_features=512, bias=True) (v): Linear(in_features=512, out_features=512, bias=True) (proj): Linear(in_features=512, out_features=512, bias=True) ) (drop_path1): Identity() (norm2): LayerNorm((512,), eps=1e-06, elementwise_affine=True) (mlp): Mlp( (fc1): Linear(in_features=512, out_features=2048, bias=True) (act): GELU(approximate='none') (fc2): Linear(in_features=2048, out_features=512, bias=True) ) (drop_path2): Identity() ) ) (norm): LayerNorm((512,), eps=1e-06, elementwise_affine=True) (head): Linear(in_features=512, out_features=128, bias=True) ) ) num params: 100.8M total batch size: 32 = 32 bs per gpu x 1 accum x 1 gpus lr: 1.25e-04 = 1.00e-03 x 32 / 256 full schedule: epochs = 100 (steps = 625000) warmup: epochs = 5 (steps = 31250) start training for 100 epochs Train: [0] [ 0/6250] eta: 20:17:30 lr: 0.000000 grad: 0.0257 (0.0257) loss: 0.9975 (0.9975) time: 11.6881 data: 9.0837 max mem: 24896 Train: [0] [ 100/6250] eta: 0:48:07 lr: 0.000000 grad: 0.0130 (0.0141) loss: 0.9959 (0.9961) time: 0.2674 data: 0.0002 max mem: 26028 Train: [0] [ 200/6250] eta: 0:37:14 lr: 0.000001 grad: 0.0112 (0.0133) loss: 0.9963 (0.9959) time: 0.2680 data: 0.0002 max mem: 26028 Train: [0] [ 300/6250] eta: 0:33:18 lr: 0.000001 grad: 0.0110 (0.0127) loss: 0.9960 (0.9959) time: 0.2714 data: 0.0002 max mem: 26030 Train: [0] [ 400/6250] eta: 0:31:05 lr: 0.000002 grad: 0.0117 (0.0124) loss: 0.9959 (0.9959) time: 0.2685 data: 0.0002 max mem: 26030 Train: [0] [ 500/6250] eta: 0:29:35 lr: 0.000002 grad: 0.0119 (0.0123) loss: 0.9957 (0.9959) time: 0.2665 data: 0.0002 max mem: 26030 Train: [0] [ 600/6250] eta: 0:28:25 lr: 0.000002 grad: 0.0116 (0.0121) loss: 0.9961 (0.9959) time: 0.2678 data: 0.0002 max mem: 26030 Train: [0] [ 700/6250] eta: 0:27:28 lr: 0.000003 grad: 0.0109 (0.0120) loss: 0.9957 (0.9959) time: 0.2671 data: 0.0002 max mem: 26030 Train: [0] [ 800/6250] eta: 0:26:39 lr: 0.000003 grad: 0.0125 (0.0120) loss: 0.9956 (0.9959) time: 0.2674 data: 0.0002 max mem: 26030 Train: [0] [ 900/6250] eta: 0:25:53 lr: 0.000004 grad: 0.0117 (0.0119) loss: 0.9954 (0.9959) time: 0.2660 data: 0.0001 max mem: 26030 Train: [0] [1000/6250] eta: 0:25:12 lr: 0.000004 grad: 0.0122 (0.0119) loss: 0.9959 (0.9959) time: 0.2671 data: 0.0002 max mem: 26075 Train: [0] [1100/6250] eta: 0:24:34 lr: 0.000004 grad: 0.0139 (0.0120) loss: 0.9960 (0.9959) time: 0.2664 data: 0.0002 max mem: 26075 Train: [0] [1200/6250] eta: 0:23:57 lr: 0.000005 grad: 0.0167 (0.0124) loss: 0.9957 (0.9959) time: 0.2677 data: 0.0002 max mem: 26075 Train: [0] [1300/6250] eta: 0:23:22 lr: 0.000005 grad: 0.0200 (0.0130) loss: 0.9955 (0.9959) time: 0.2685 data: 0.0002 max mem: 26075 Train: [0] [1400/6250] eta: 0:22:49 lr: 0.000006 grad: 0.0236 (0.0138) loss: 0.9956 (0.9959) time: 0.2691 data: 0.0002 max mem: 26075 Train: [0] [1500/6250] eta: 0:22:16 lr: 0.000006 grad: 0.0300 (0.0153) loss: 0.9954 (0.9958) time: 0.2700 data: 0.0002 max mem: 26075 Train: [0] [1600/6250] eta: 0:21:44 lr: 0.000006 grad: 0.0367 (0.0171) loss: 0.9951 (0.9958) time: 0.2682 data: 0.0002 max mem: 26075 Train: [0] [1700/6250] eta: 0:21:13 lr: 0.000007 grad: 0.0554 (0.0192) loss: 0.9950 (0.9958) time: 0.2677 data: 0.0002 max mem: 26075 Train: [0] [1800/6250] eta: 0:20:41 lr: 0.000007 grad: 0.0417 (0.0212) loss: 0.9950 (0.9957) time: 0.2666 data: 0.0001 max mem: 26075 Train: [0] [1900/6250] eta: 0:20:11 lr: 0.000008 grad: 0.0409 (0.0227) loss: 0.9950 (0.9957) time: 0.2673 data: 0.0002 max mem: 26075 Train: [0] [2000/6250] eta: 0:19:41 lr: 0.000008 grad: 0.0420 (0.0244) loss: 0.9949 (0.9956) time: 0.2691 data: 0.0002 max mem: 26075 Train: [0] [2100/6250] eta: 0:19:11 lr: 0.000008 grad: 0.0411 (0.0254) loss: 0.9953 (0.9956) time: 0.2702 data: 0.0002 max mem: 26075 Train: [0] [2200/6250] eta: 0:18:41 lr: 0.000009 grad: 0.0537 (0.0266) loss: 0.9948 (0.9956) time: 0.2680 data: 0.0002 max mem: 26075 Train: [0] [2300/6250] eta: 0:18:12 lr: 0.000009 grad: 0.0495 (0.0279) loss: 0.9947 (0.9955) time: 0.2679 data: 0.0002 max mem: 26075 Train: [0] [2400/6250] eta: 0:17:43 lr: 0.000010 grad: 0.0446 (0.0289) loss: 0.9943 (0.9955) time: 0.2670 data: 0.0002 max mem: 26075 Train: [0] [2500/6250] eta: 0:17:14 lr: 0.000010 grad: 0.0471 (0.0298) loss: 0.9941 (0.9955) time: 0.2687 data: 0.0002 max mem: 26075 Train: [0] [2600/6250] eta: 0:16:46 lr: 0.000010 grad: 0.0435 (0.0306) loss: 0.9949 (0.9954) time: 0.2681 data: 0.0002 max mem: 26075 Train: [0] [2700/6250] eta: 0:16:17 lr: 0.000011 grad: 0.0483 (0.0314) loss: 0.9940 (0.9954) time: 0.2664 data: 0.0002 max mem: 26075 Train: [0] [2800/6250] eta: 0:15:49 lr: 0.000011 grad: 0.0494 (0.0323) loss: 0.9939 (0.9953) time: 0.2684 data: 0.0002 max mem: 26075 Train: [0] [2900/6250] eta: 0:15:20 lr: 0.000012 grad: 0.0605 (0.0333) loss: 0.9937 (0.9953) time: 0.2666 data: 0.0002 max mem: 26075 Train: [0] [3000/6250] eta: 0:14:52 lr: 0.000012 grad: 0.0527 (0.0342) loss: 0.9938 (0.9952) time: 0.2679 data: 0.0002 max mem: 26075 Train: [0] [3100/6250] eta: 0:14:24 lr: 0.000012 grad: 0.0684 (0.0351) loss: 0.9930 (0.9952) time: 0.2673 data: 0.0002 max mem: 26075 Train: [0] [3200/6250] eta: 0:13:56 lr: 0.000013 grad: 0.0683 (0.0362) loss: 0.9933 (0.9951) time: 0.2675 data: 0.0002 max mem: 26075 Train: [0] [3300/6250] eta: 0:13:28 lr: 0.000013 grad: 0.0632 (0.0373) loss: 0.9922 (0.9950) time: 0.2683 data: 0.0002 max mem: 26075 Train: [0] [3400/6250] eta: 0:13:00 lr: 0.000014 grad: 0.0781 (0.0384) loss: 0.9920 (0.9949) time: 0.2676 data: 0.0001 max mem: 26075 Train: [0] [3500/6250] eta: 0:12:32 lr: 0.000014 grad: 0.0624 (0.0394) loss: 0.9926 (0.9949) time: 0.2672 data: 0.0002 max mem: 26075 Train: [0] [3600/6250] eta: 0:12:04 lr: 0.000014 grad: 0.0685 (0.0405) loss: 0.9918 (0.9948) time: 0.2678 data: 0.0002 max mem: 26075 Train: [0] [3700/6250] eta: 0:11:36 lr: 0.000015 grad: 0.0722 (0.0417) loss: 0.9920 (0.9947) time: 0.2679 data: 0.0002 max mem: 26075 Train: [0] [3800/6250] eta: 0:11:09 lr: 0.000015 grad: 0.0724 (0.0427) loss: 0.9906 (0.9946) time: 0.2672 data: 0.0002 max mem: 26075 Train: [0] [3900/6250] eta: 0:10:41 lr: 0.000016 grad: 0.0691 (0.0436) loss: 0.9916 (0.9945) time: 0.2682 data: 0.0001 max mem: 26075 Train: [0] [4000/6250] eta: 0:10:14 lr: 0.000016 grad: 0.0709 (0.0444) loss: 0.9925 (0.9945) time: 0.2675 data: 0.0002 max mem: 26075 Train: [0] [4100/6250] eta: 0:09:46 lr: 0.000016 grad: 0.0766 (0.0453) loss: 0.9921 (0.9944) time: 0.2689 data: 0.0002 max mem: 26075 Train: [0] [4200/6250] eta: 0:09:18 lr: 0.000017 grad: 0.0719 (0.0461) loss: 0.9910 (0.9943) time: 0.2686 data: 0.0002 max mem: 26075 Train: [0] [4300/6250] eta: 0:08:51 lr: 0.000017 grad: 0.0859 (0.0470) loss: 0.9913 (0.9942) time: 0.2675 data: 0.0002 max mem: 26075 Train: [0] [4400/6250] eta: 0:08:24 lr: 0.000018 grad: 0.0688 (0.0477) loss: 0.9914 (0.9942) time: 0.2690 data: 0.0002 max mem: 26075 Train: [0] [4500/6250] eta: 0:07:56 lr: 0.000018 grad: 0.0776 (0.0485) loss: 0.9898 (0.9941) time: 0.2686 data: 0.0002 max mem: 26075 Train: [0] [4600/6250] eta: 0:07:29 lr: 0.000018 grad: 0.0893 (0.0492) loss: 0.9905 (0.9940) time: 0.2678 data: 0.0002 max mem: 26075 Train: [0] [4700/6250] eta: 0:07:01 lr: 0.000019 grad: 0.0777 (0.0500) loss: 0.9892 (0.9939) time: 0.2670 data: 0.0002 max mem: 26075 Train: [0] [4800/6250] eta: 0:06:34 lr: 0.000019 grad: 0.0714 (0.0506) loss: 0.9914 (0.9938) time: 0.2675 data: 0.0002 max mem: 26075 Train: [0] [4900/6250] eta: 0:06:07 lr: 0.000020 grad: 0.0625 (0.0511) loss: 0.9927 (0.9938) time: 0.2667 data: 0.0001 max mem: 26075 Train: [0] [5000/6250] eta: 0:05:39 lr: 0.000020 grad: 0.0864 (0.0517) loss: 0.9912 (0.9937) time: 0.2683 data: 0.0002 max mem: 26109 Train: [0] [5100/6250] eta: 0:05:12 lr: 0.000020 grad: 0.0661 (0.0522) loss: 0.9913 (0.9937) time: 0.2676 data: 0.0002 max mem: 26109 Train: [0] [5200/6250] eta: 0:04:45 lr: 0.000021 grad: 0.0631 (0.0527) loss: 0.9913 (0.9936) time: 0.2668 data: 0.0001 max mem: 26109 Train: [0] [5300/6250] eta: 0:04:18 lr: 0.000021 grad: 0.0752 (0.0533) loss: 0.9901 (0.9936) time: 0.2679 data: 0.0001 max mem: 26109 Train: [0] [5400/6250] eta: 0:03:50 lr: 0.000022 grad: 0.0788 (0.0537) loss: 0.9902 (0.9935) time: 0.2689 data: 0.0002 max mem: 26109 Train: [0] [5500/6250] eta: 0:03:23 lr: 0.000022 grad: 0.0828 (0.0542) loss: 0.9915 (0.9935) time: 0.2682 data: 0.0002 max mem: 26109 Train: [0] [5600/6250] eta: 0:02:56 lr: 0.000022 grad: 0.0630 (0.0546) loss: 0.9920 (0.9934) time: 0.2677 data: 0.0002 max mem: 26109 Train: [0] [5700/6250] eta: 0:02:29 lr: 0.000023 grad: 0.0730 (0.0549) loss: 0.9908 (0.9933) time: 0.2681 data: 0.0002 max mem: 26109 Train: [0] [5800/6250] eta: 0:02:02 lr: 0.000023 grad: 0.0736 (0.0554) loss: 0.9892 (0.9933) time: 0.2676 data: 0.0002 max mem: 26109 Train: [0] [5900/6250] eta: 0:01:34 lr: 0.000024 grad: 0.0697 (0.0560) loss: 0.9919 (0.9932) time: 0.2683 data: 0.0002 max mem: 26109 Train: [0] [6000/6250] eta: 0:01:07 lr: 0.000024 grad: 0.0845 (0.0564) loss: 0.9898 (0.9932) time: 0.2677 data: 0.0002 max mem: 26109 Train: [0] [6100/6250] eta: 0:00:40 lr: 0.000024 grad: 0.0712 (0.0569) loss: 0.9905 (0.9931) time: 0.2676 data: 0.0002 max mem: 26109 Train: [0] [6200/6250] eta: 0:00:13 lr: 0.000025 grad: 0.0710 (0.0574) loss: 0.9898 (0.9930) time: 0.2678 data: 0.0002 max mem: 26109 Train: [0] [6249/6250] eta: 0:00:00 lr: 0.000025 grad: 0.0812 (0.0575) loss: 0.9884 (0.9930) time: 0.2680 data: 0.0002 max mem: 26109 Train: [0] Total time: 0:28:18 (0.2717 s / it) Averaged stats: lr: 0.000025 grad: 0.0812 (0.0575) loss: 0.9884 (0.9930) Eval (hcp-train-subset): [0] [ 0/62] eta: 0:04:53 loss: 0.9894 (0.9894) time: 4.7340 data: 4.6488 max mem: 26109 Eval (hcp-train-subset): [0] [61/62] eta: 0:00:00 loss: 0.9917 (0.9908) time: 0.1183 data: 0.0350 max mem: 26109 Eval (hcp-train-subset): [0] Total time: 0:00:12 (0.1961 s / it) Averaged stats (hcp-train-subset): loss: 0.9917 (0.9908) Making plots (hcp-train-subset): example=17 Eval (hcp-val): [0] [ 0/62] eta: 0:04:39 loss: 0.9843 (0.9843) time: 4.5043 data: 4.4184 max mem: 26109 Eval (hcp-val): [0] [61/62] eta: 0:00:00 loss: 0.9889 (0.9897) time: 0.1214 data: 0.0385 max mem: 26109 Eval (hcp-val): [0] Total time: 0:00:11 (0.1925 s / it) Averaged stats (hcp-val): loss: 0.9889 (0.9897) Making plots (hcp-val): example=5 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [1] [ 0/6250] eta: 9:47:54 lr: 0.000025 grad: 0.0858 (0.0858) loss: 0.9920 (0.9920) time: 5.6439 data: 5.3693 max mem: 26109 Train: [1] [ 100/6250] eta: 0:33:07 lr: 0.000025 grad: 0.0687 (0.0784) loss: 0.9915 (0.9912) time: 0.2696 data: 0.0002 max mem: 26109 Train: [1] [ 200/6250] eta: 0:29:50 lr: 0.000026 grad: 0.0629 (0.0722) loss: 0.9916 (0.9914) time: 0.2682 data: 0.0002 max mem: 26109 Train: [1] [ 300/6250] eta: 0:28:29 lr: 0.000026 grad: 0.0776 (0.0729) loss: 0.9891 (0.9910) time: 0.2714 data: 0.0002 max mem: 26109 Train: [1] [ 400/6250] eta: 0:27:34 lr: 0.000027 grad: 0.0827 (0.0741) loss: 0.9899 (0.9906) time: 0.2703 data: 0.0002 max mem: 26109 Train: [1] [ 500/6250] eta: 0:26:50 lr: 0.000027 grad: 0.0725 (0.0743) loss: 0.9905 (0.9905) time: 0.2702 data: 0.0002 max mem: 26109 Train: [1] [ 600/6250] eta: 0:26:11 lr: 0.000027 grad: 0.0626 (0.0748) loss: 0.9891 (0.9903) time: 0.2683 data: 0.0002 max mem: 26109 Train: [1] [ 700/6250] eta: 0:25:37 lr: 0.000028 grad: 0.0637 (0.0750) loss: 0.9896 (0.9902) time: 0.2698 data: 0.0002 max mem: 26109 Train: [1] [ 800/6250] eta: 0:25:20 lr: 0.000028 grad: 0.0683 (0.0744) loss: 0.9908 (0.9902) time: 0.2734 data: 0.0003 max mem: 26109 Train: [1] [ 900/6250] eta: 0:24:52 lr: 0.000029 grad: 0.0760 (0.0748) loss: 0.9896 (0.9902) time: 0.2792 data: 0.0002 max mem: 26109 Train: [1] [1000/6250] eta: 0:24:22 lr: 0.000029 grad: 0.0699 (0.0748) loss: 0.9895 (0.9901) time: 0.2704 data: 0.0002 max mem: 26109 Train: [1] [1100/6250] eta: 0:24:08 lr: 0.000029 grad: 0.0803 (0.0748) loss: 0.9891 (0.9900) time: 0.4123 data: 0.1366 max mem: 26109 Train: [1] [1200/6250] eta: 0:23:42 lr: 0.000030 grad: 0.0729 (0.0755) loss: 0.9897 (0.9900) time: 0.2684 data: 0.0002 max mem: 26109 Train: [1] [1300/6250] eta: 0:23:09 lr: 0.000030 grad: 0.0721 (0.0757) loss: 0.9888 (0.9900) time: 0.2679 data: 0.0002 max mem: 26109 Train: [1] [1400/6250] eta: 0:22:36 lr: 0.000031 grad: 0.0777 (0.0762) loss: 0.9886 (0.9899) time: 0.2692 data: 0.0002 max mem: 26109 Train: [1] [1500/6250] eta: 0:22:05 lr: 0.000031 grad: 0.0769 (0.0767) loss: 0.9901 (0.9899) time: 0.2673 data: 0.0002 max mem: 26109 Train: [1] [1600/6250] eta: 0:21:34 lr: 0.000031 grad: 0.0756 (0.0771) loss: 0.9891 (0.9898) time: 0.2697 data: 0.0002 max mem: 26109 Train: [1] [1700/6250] eta: 0:21:04 lr: 0.000032 grad: 0.0692 (0.0776) loss: 0.9896 (0.9898) time: 0.2692 data: 0.0002 max mem: 26109 Train: [1] [1800/6250] eta: 0:20:39 lr: 0.000032 grad: 0.0666 (0.0778) loss: 0.9905 (0.9897) time: 0.2685 data: 0.0002 max mem: 26109 Train: [1] [1900/6250] eta: 0:20:17 lr: 0.000033 grad: 0.0683 (0.0779) loss: 0.9887 (0.9897) time: 0.2673 data: 0.0002 max mem: 26109 Train: [1] [2000/6250] eta: 0:19:47 lr: 0.000033 grad: 0.0934 (0.0782) loss: 0.9887 (0.9896) time: 0.2676 data: 0.0002 max mem: 26109 Train: [1] [2100/6250] eta: 0:19:17 lr: 0.000033 grad: 0.0738 (0.0783) loss: 0.9895 (0.9896) time: 0.2679 data: 0.0002 max mem: 26109 Train: [1] [2200/6250] eta: 0:18:47 lr: 0.000034 grad: 0.0770 (0.0784) loss: 0.9885 (0.9895) time: 0.2709 data: 0.0002 max mem: 26109 Train: [1] [2300/6250] eta: 0:18:18 lr: 0.000034 grad: 0.0802 (0.0787) loss: 0.9879 (0.9895) time: 0.2675 data: 0.0002 max mem: 26109 Train: [1] [2400/6250] eta: 0:17:48 lr: 0.000035 grad: 0.0745 (0.0790) loss: 0.9884 (0.9894) time: 0.2707 data: 0.0002 max mem: 26109 Train: [1] [2500/6250] eta: 0:17:19 lr: 0.000035 grad: 0.0855 (0.0789) loss: 0.9890 (0.9894) time: 0.2674 data: 0.0002 max mem: 26109 Train: [1] [2600/6250] eta: 0:16:50 lr: 0.000035 grad: 0.0722 (0.0790) loss: 0.9882 (0.9894) time: 0.2683 data: 0.0002 max mem: 26109 Train: [1] [2700/6250] eta: 0:16:21 lr: 0.000036 grad: 0.0756 (0.0790) loss: 0.9882 (0.9893) time: 0.2683 data: 0.0001 max mem: 26109 Train: [1] [2800/6250] eta: 0:15:53 lr: 0.000036 grad: 0.0785 (0.0794) loss: 0.9879 (0.9893) time: 0.2687 data: 0.0002 max mem: 26109 Train: [1] [2900/6250] eta: 0:15:24 lr: 0.000037 grad: 0.0788 (0.0794) loss: 0.9891 (0.9892) time: 0.2668 data: 0.0002 max mem: 26109 Train: [1] [3000/6250] eta: 0:14:56 lr: 0.000037 grad: 0.0705 (0.0796) loss: 0.9888 (0.9892) time: 0.2677 data: 0.0002 max mem: 26109 Train: [1] [3100/6250] eta: 0:14:27 lr: 0.000037 grad: 0.0760 (0.0797) loss: 0.9885 (0.9892) time: 0.2687 data: 0.0002 max mem: 26109 Train: [1] [3200/6250] eta: 0:13:59 lr: 0.000038 grad: 0.0861 (0.0799) loss: 0.9871 (0.9891) time: 0.2684 data: 0.0002 max mem: 26109 Train: [1] [3300/6250] eta: 0:13:31 lr: 0.000038 grad: 0.0871 (0.0801) loss: 0.9888 (0.9891) time: 0.2681 data: 0.0002 max mem: 26109 Train: [1] [3400/6250] eta: 0:13:03 lr: 0.000039 grad: 0.0676 (0.0801) loss: 0.9890 (0.9891) time: 0.2693 data: 0.0002 max mem: 26109 Train: [1] [3500/6250] eta: 0:12:35 lr: 0.000039 grad: 0.0769 (0.0800) loss: 0.9877 (0.9891) time: 0.2755 data: 0.0030 max mem: 26109 Train: [1] [3600/6250] eta: 0:12:08 lr: 0.000039 grad: 0.0756 (0.0801) loss: 0.9889 (0.9891) time: 0.2684 data: 0.0001 max mem: 26109 Train: [1] [3700/6250] eta: 0:11:40 lr: 0.000040 grad: 0.0814 (0.0801) loss: 0.9877 (0.9890) time: 0.2673 data: 0.0002 max mem: 26109 Train: [1] [3800/6250] eta: 0:11:12 lr: 0.000040 grad: 0.0643 (0.0800) loss: 0.9882 (0.9890) time: 0.2673 data: 0.0002 max mem: 26109 Train: [1] [3900/6250] eta: 0:10:44 lr: 0.000041 grad: 0.0639 (0.0799) loss: 0.9874 (0.9890) time: 0.2686 data: 0.0002 max mem: 26109 Train: [1] [4000/6250] eta: 0:10:16 lr: 0.000041 grad: 0.0721 (0.0798) loss: 0.9871 (0.9890) time: 0.2681 data: 0.0002 max mem: 26109 Train: [1] [4100/6250] eta: 0:09:48 lr: 0.000041 grad: 0.0671 (0.0797) loss: 0.9879 (0.9890) time: 0.2674 data: 0.0002 max mem: 26109 Train: [1] [4200/6250] eta: 0:09:21 lr: 0.000042 grad: 0.0593 (0.0795) loss: 0.9875 (0.9889) time: 0.2700 data: 0.0002 max mem: 26109 Train: [1] [4300/6250] eta: 0:08:53 lr: 0.000042 grad: 0.0657 (0.0794) loss: 0.9889 (0.9889) time: 0.2677 data: 0.0002 max mem: 26109 Train: [1] [4400/6250] eta: 0:08:26 lr: 0.000043 grad: 0.0696 (0.0794) loss: 0.9885 (0.9889) time: 0.2698 data: 0.0002 max mem: 26109 Train: [1] [4500/6250] eta: 0:07:58 lr: 0.000043 grad: 0.0738 (0.0793) loss: 0.9873 (0.9889) time: 0.2684 data: 0.0002 max mem: 26109 Train: [1] [4600/6250] eta: 0:07:31 lr: 0.000043 grad: 0.0784 (0.0792) loss: 0.9871 (0.9889) time: 0.2679 data: 0.0002 max mem: 26109 Train: [1] [4700/6250] eta: 0:07:03 lr: 0.000044 grad: 0.0800 (0.0792) loss: 0.9884 (0.9888) time: 0.2694 data: 0.0002 max mem: 26109 Train: [1] [4800/6250] eta: 0:06:36 lr: 0.000044 grad: 0.0776 (0.0791) loss: 0.9865 (0.9888) time: 0.2675 data: 0.0002 max mem: 26109 Train: [1] [4900/6250] eta: 0:06:08 lr: 0.000045 grad: 0.0820 (0.0791) loss: 0.9855 (0.9888) time: 0.2683 data: 0.0002 max mem: 26109 Train: [1] [5000/6250] eta: 0:05:41 lr: 0.000045 grad: 0.0759 (0.0792) loss: 0.9883 (0.9887) time: 0.2694 data: 0.0002 max mem: 26109 Train: [1] [5100/6250] eta: 0:05:13 lr: 0.000045 grad: 0.0682 (0.0792) loss: 0.9888 (0.9887) time: 0.2692 data: 0.0002 max mem: 26109 Train: [1] [5200/6250] eta: 0:04:46 lr: 0.000046 grad: 0.0786 (0.0791) loss: 0.9861 (0.9887) time: 0.2684 data: 0.0002 max mem: 26109 Train: [1] [5300/6250] eta: 0:04:19 lr: 0.000046 grad: 0.0719 (0.0792) loss: 0.9876 (0.9886) time: 0.2689 data: 0.0002 max mem: 26109 Train: [1] [5400/6250] eta: 0:03:51 lr: 0.000047 grad: 0.0711 (0.0793) loss: 0.9881 (0.9886) time: 0.2672 data: 0.0002 max mem: 26109 Train: [1] [5500/6250] eta: 0:03:24 lr: 0.000047 grad: 0.0809 (0.0793) loss: 0.9879 (0.9886) time: 0.2688 data: 0.0001 max mem: 26109 Train: [1] [5600/6250] eta: 0:02:57 lr: 0.000047 grad: 0.0767 (0.0794) loss: 0.9862 (0.9886) time: 0.2694 data: 0.0002 max mem: 26109 Train: [1] [5700/6250] eta: 0:02:29 lr: 0.000048 grad: 0.0791 (0.0794) loss: 0.9854 (0.9885) time: 0.2679 data: 0.0002 max mem: 26109 Train: [1] [5800/6250] eta: 0:02:02 lr: 0.000048 grad: 0.0706 (0.0794) loss: 0.9864 (0.9885) time: 0.2671 data: 0.0002 max mem: 26109 Train: [1] [5900/6250] eta: 0:01:35 lr: 0.000049 grad: 0.0791 (0.0794) loss: 0.9863 (0.9885) time: 0.2699 data: 0.0002 max mem: 26109 Train: [1] [6000/6250] eta: 0:01:08 lr: 0.000049 grad: 0.0807 (0.0794) loss: 0.9860 (0.9884) time: 0.2680 data: 0.0002 max mem: 26109 Train: [1] [6100/6250] eta: 0:00:40 lr: 0.000049 grad: 0.0833 (0.0795) loss: 0.9876 (0.9884) time: 0.2702 data: 0.0002 max mem: 26109 Train: [1] [6200/6250] eta: 0:00:13 lr: 0.000050 grad: 0.0808 (0.0795) loss: 0.9858 (0.9884) time: 0.2679 data: 0.0002 max mem: 26109 Train: [1] [6249/6250] eta: 0:00:00 lr: 0.000050 grad: 0.0756 (0.0795) loss: 0.9866 (0.9884) time: 0.2683 data: 0.0002 max mem: 26109 Train: [1] Total time: 0:28:27 (0.2732 s / it) Averaged stats: lr: 0.000050 grad: 0.0756 (0.0795) loss: 0.9866 (0.9884) Eval (hcp-train-subset): [1] [ 0/62] eta: 0:03:15 loss: 0.9839 (0.9839) time: 3.1531 data: 3.0361 max mem: 26109 Eval (hcp-train-subset): [1] [61/62] eta: 0:00:00 loss: 0.9901 (0.9893) time: 0.1396 data: 0.0552 max mem: 26109 Eval (hcp-train-subset): [1] Total time: 0:00:13 (0.2161 s / it) Averaged stats (hcp-train-subset): loss: 0.9901 (0.9893) Making plots (hcp-train-subset): example=49 Eval (hcp-val): [1] [ 0/62] eta: 0:04:26 loss: 0.9851 (0.9851) time: 4.2949 data: 4.1707 max mem: 26109 Eval (hcp-val): [1] [61/62] eta: 0:00:00 loss: 0.9880 (0.9878) time: 0.1266 data: 0.0436 max mem: 26109 Eval (hcp-val): [1] Total time: 0:00:12 (0.2055 s / it) Averaged stats (hcp-val): loss: 0.9880 (0.9878) Making plots (hcp-val): example=25 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [2] [ 0/6250] eta: 8:42:34 lr: 0.000050 grad: 0.0762 (0.0762) loss: 0.9951 (0.9951) time: 5.0168 data: 4.6043 max mem: 26109 Train: [2] [ 100/6250] eta: 0:34:22 lr: 0.000050 grad: 0.0675 (0.0737) loss: 0.9883 (0.9896) time: 0.2707 data: 0.0002 max mem: 26109 Train: [2] [ 200/6250] eta: 0:30:34 lr: 0.000051 grad: 0.0645 (0.0717) loss: 0.9889 (0.9892) time: 0.2715 data: 0.0002 max mem: 26109 Train: [2] [ 300/6250] eta: 0:28:59 lr: 0.000051 grad: 0.0769 (0.0729) loss: 0.9883 (0.9888) time: 0.2717 data: 0.0002 max mem: 26109 Train: [2] [ 400/6250] eta: 0:28:01 lr: 0.000052 grad: 0.0709 (0.0740) loss: 0.9890 (0.9884) time: 0.2709 data: 0.0004 max mem: 26109 Train: [2] [ 500/6250] eta: 0:27:13 lr: 0.000052 grad: 0.0822 (0.0750) loss: 0.9859 (0.9880) time: 0.2699 data: 0.0002 max mem: 26109 Train: [2] [ 600/6250] eta: 0:26:31 lr: 0.000052 grad: 0.1074 (0.0773) loss: 0.9849 (0.9876) time: 0.2689 data: 0.0002 max mem: 26109 Train: [2] [ 700/6250] eta: 0:25:56 lr: 0.000053 grad: 0.0793 (0.0790) loss: 0.9859 (0.9872) time: 0.2758 data: 0.0003 max mem: 26109 Train: [2] [ 800/6250] eta: 0:25:21 lr: 0.000053 grad: 0.0788 (0.0804) loss: 0.9856 (0.9870) time: 0.2718 data: 0.0002 max mem: 26109 Train: [2] [ 900/6250] eta: 0:24:48 lr: 0.000054 grad: 0.0881 (0.0809) loss: 0.9868 (0.9868) time: 0.2677 data: 0.0002 max mem: 26109 Train: [2] [1000/6250] eta: 0:24:16 lr: 0.000054 grad: 0.0777 (0.0818) loss: 0.9864 (0.9867) time: 0.2717 data: 0.0002 max mem: 26109 Train: [2] [1100/6250] eta: 0:23:45 lr: 0.000054 grad: 0.0960 (0.0830) loss: 0.9867 (0.9866) time: 0.2691 data: 0.0002 max mem: 26109 Train: [2] [1200/6250] eta: 0:23:14 lr: 0.000055 grad: 0.0778 (0.0836) loss: 0.9873 (0.9865) time: 0.2706 data: 0.0002 max mem: 26109 Train: [2] [1300/6250] eta: 0:22:44 lr: 0.000055 grad: 0.1060 (0.0845) loss: 0.9851 (0.9864) time: 0.2702 data: 0.0002 max mem: 26109 Train: [2] [1400/6250] eta: 0:22:15 lr: 0.000056 grad: 0.0728 (0.0846) loss: 0.9889 (0.9864) time: 0.2700 data: 0.0002 max mem: 26110 Train: [2] [1500/6250] eta: 0:21:45 lr: 0.000056 grad: 0.0744 (0.0850) loss: 0.9849 (0.9864) time: 0.2682 data: 0.0002 max mem: 26110 Train: [2] [1600/6250] eta: 0:21:16 lr: 0.000056 grad: 0.0823 (0.0852) loss: 0.9844 (0.9863) time: 0.2696 data: 0.0002 max mem: 26110 Train: [2] [1700/6250] eta: 0:20:48 lr: 0.000057 grad: 0.0902 (0.0857) loss: 0.9881 (0.9863) time: 0.2722 data: 0.0002 max mem: 26110 Train: [2] [1800/6250] eta: 0:20:20 lr: 0.000057 grad: 0.0846 (0.0859) loss: 0.9856 (0.9862) time: 0.2690 data: 0.0002 max mem: 26110 Train: [2] [1900/6250] eta: 0:19:51 lr: 0.000058 grad: 0.0843 (0.0863) loss: 0.9855 (0.9862) time: 0.2708 data: 0.0002 max mem: 26110 Train: [2] [2000/6250] eta: 0:19:23 lr: 0.000058 grad: 0.0833 (0.0863) loss: 0.9848 (0.9861) time: 0.2700 data: 0.0002 max mem: 26110 Train: [2] [2100/6250] eta: 0:18:55 lr: 0.000058 grad: 0.0891 (0.0868) loss: 0.9849 (0.9861) time: 0.2701 data: 0.0002 max mem: 26110 Train: [2] [2200/6250] eta: 0:18:27 lr: 0.000059 grad: 0.0771 (0.0871) loss: 0.9818 (0.9860) time: 0.2701 data: 0.0002 max mem: 26110 Train: [2] [2300/6250] eta: 0:17:59 lr: 0.000059 grad: 0.0817 (0.0874) loss: 0.9849 (0.9859) time: 0.2695 data: 0.0002 max mem: 26110 Train: [2] [2400/6250] eta: 0:17:31 lr: 0.000060 grad: 0.0947 (0.0885) loss: 0.9849 (0.9859) time: 0.2694 data: 0.0002 max mem: 26110 Train: [2] [2500/6250] eta: 0:17:03 lr: 0.000060 grad: 0.0938 (0.0889) loss: 0.9853 (0.9858) time: 0.2692 data: 0.0002 max mem: 26110 Train: [2] [2600/6250] eta: 0:16:35 lr: 0.000060 grad: 0.0862 (0.0902) loss: 0.9835 (0.9858) time: 0.2712 data: 0.0002 max mem: 26110 Train: [2] [2700/6250] eta: 0:16:08 lr: 0.000061 grad: 0.1211 (0.0920) loss: 0.9855 (0.9857) time: 0.2697 data: 0.0002 max mem: 26110 Train: [2] [2800/6250] eta: 0:15:40 lr: 0.000061 grad: 0.1551 (0.0943) loss: 0.9841 (0.9857) time: 0.2706 data: 0.0002 max mem: 26110 Train: [2] [2900/6250] eta: 0:15:13 lr: 0.000062 grad: 0.1230 (0.0970) loss: 0.9831 (0.9856) time: 0.2687 data: 0.0002 max mem: 26110 Train: [2] [3000/6250] eta: 0:14:45 lr: 0.000062 grad: 0.2307 (0.1017) loss: 0.9837 (0.9855) time: 0.2688 data: 0.0002 max mem: 26110 Train: [2] [3100/6250] eta: 0:14:18 lr: 0.000062 grad: 0.1516 (0.1055) loss: 0.9845 (0.9855) time: 0.2696 data: 0.0002 max mem: 26110 Train: [2] [3200/6250] eta: 0:13:50 lr: 0.000063 grad: 0.1336 (0.1099) loss: 0.9812 (0.9854) time: 0.2705 data: 0.0002 max mem: 26110 Train: [2] [3300/6250] eta: 0:13:23 lr: 0.000063 grad: 0.2516 (0.1161) loss: 0.9826 (0.9853) time: 0.2686 data: 0.0002 max mem: 26110 Train: [2] [3400/6250] eta: 0:12:55 lr: 0.000064 grad: 0.2072 (0.1214) loss: 0.9846 (0.9853) time: 0.2695 data: 0.0001 max mem: 26110 Train: [2] [3500/6250] eta: 0:12:28 lr: 0.000064 grad: 0.1772 (0.1265) loss: 0.9822 (0.9852) time: 0.2733 data: 0.0002 max mem: 26110 Train: [2] [3600/6250] eta: 0:12:01 lr: 0.000064 grad: 0.2627 (0.1318) loss: 0.9823 (0.9851) time: 0.2704 data: 0.0002 max mem: 26110 Train: [2] [3700/6250] eta: 0:11:33 lr: 0.000065 grad: 0.2471 (0.1371) loss: 0.9818 (0.9850) time: 0.2699 data: 0.0002 max mem: 26110 Train: [2] [3800/6250] eta: 0:11:06 lr: 0.000065 grad: 0.3020 (0.1428) loss: 0.9822 (0.9849) time: 0.2718 data: 0.0002 max mem: 26110 Train: [2] [3900/6250] eta: 0:10:39 lr: 0.000066 grad: 0.2026 (0.1463) loss: 0.9801 (0.9849) time: 0.2693 data: 0.0002 max mem: 26110 Train: [2] [4000/6250] eta: 0:10:11 lr: 0.000066 grad: 0.2315 (0.1504) loss: 0.9814 (0.9848) time: 0.2696 data: 0.0002 max mem: 26110 Train: [2] [4100/6250] eta: 0:09:44 lr: 0.000066 grad: 0.2624 (0.1539) loss: 0.9803 (0.9847) time: 0.2714 data: 0.0002 max mem: 26110 Train: [2] [4200/6250] eta: 0:09:17 lr: 0.000067 grad: 0.2620 (0.1564) loss: 0.9816 (0.9846) time: 0.2683 data: 0.0002 max mem: 26110 Train: [2] [4300/6250] eta: 0:08:50 lr: 0.000067 grad: 0.3585 (0.1593) loss: 0.9818 (0.9846) time: 0.2683 data: 0.0002 max mem: 26110 Train: [2] [4400/6250] eta: 0:08:23 lr: 0.000068 grad: 0.2511 (0.1629) loss: 0.9815 (0.9845) time: 0.2692 data: 0.0002 max mem: 26110 Train: [2] [4500/6250] eta: 0:07:55 lr: 0.000068 grad: 0.2963 (0.1656) loss: 0.9795 (0.9844) time: 0.2677 data: 0.0002 max mem: 26110 Train: [2] [4600/6250] eta: 0:07:28 lr: 0.000068 grad: 0.2889 (0.1678) loss: 0.9826 (0.9843) time: 0.2691 data: 0.0002 max mem: 26110 Train: [2] [4700/6250] eta: 0:07:01 lr: 0.000069 grad: 0.2283 (0.1694) loss: 0.9808 (0.9843) time: 0.2681 data: 0.0002 max mem: 26110 Train: [2] [4800/6250] eta: 0:06:33 lr: 0.000069 grad: 0.1988 (0.1717) loss: 0.9826 (0.9842) time: 0.2692 data: 0.0002 max mem: 26110 Train: [2] [4900/6250] eta: 0:06:06 lr: 0.000070 grad: 0.1871 (0.1729) loss: 0.9809 (0.9842) time: 0.2685 data: 0.0002 max mem: 26110 Train: [2] [5000/6250] eta: 0:05:39 lr: 0.000070 grad: 0.1754 (0.1753) loss: 0.9795 (0.9841) time: 0.2752 data: 0.0002 max mem: 26110 Train: [2] [5100/6250] eta: 0:05:12 lr: 0.000070 grad: 0.2308 (0.1774) loss: 0.9803 (0.9840) time: 0.2685 data: 0.0002 max mem: 26110 Train: [2] [5200/6250] eta: 0:04:45 lr: 0.000071 grad: 0.2696 (0.1797) loss: 0.9818 (0.9840) time: 0.2671 data: 0.0002 max mem: 26110 Train: [2] [5300/6250] eta: 0:04:17 lr: 0.000071 grad: 0.2332 (0.1818) loss: 0.9801 (0.9839) time: 0.2672 data: 0.0001 max mem: 26110 Train: [2] [5400/6250] eta: 0:03:50 lr: 0.000072 grad: 0.1745 (0.1835) loss: 0.9807 (0.9839) time: 0.2678 data: 0.0002 max mem: 26110 Train: [2] [5500/6250] eta: 0:03:23 lr: 0.000072 grad: 0.1721 (0.1845) loss: 0.9797 (0.9838) time: 0.2673 data: 0.0002 max mem: 26110 Train: [2] [5600/6250] eta: 0:02:56 lr: 0.000072 grad: 0.2637 (0.1859) loss: 0.9803 (0.9837) time: 0.2676 data: 0.0002 max mem: 26110 Train: [2] [5700/6250] eta: 0:02:29 lr: 0.000073 grad: 0.1561 (0.1886) loss: 0.9798 (0.9837) time: 0.2694 data: 0.0002 max mem: 26110 Train: [2] [5800/6250] eta: 0:02:02 lr: 0.000073 grad: 0.2680 (0.1895) loss: 0.9793 (0.9836) time: 0.2697 data: 0.0002 max mem: 26110 Train: [2] [5900/6250] eta: 0:01:34 lr: 0.000074 grad: 0.4017 (0.1910) loss: 0.9810 (0.9835) time: 0.2680 data: 0.0002 max mem: 26110 Train: [2] [6000/6250] eta: 0:01:07 lr: 0.000074 grad: 0.1476 (0.1918) loss: 0.9795 (0.9834) time: 0.2682 data: 0.0002 max mem: 26110 Train: [2] [6100/6250] eta: 0:00:40 lr: 0.000074 grad: 0.1772 (0.1927) loss: 0.9804 (0.9834) time: 0.2697 data: 0.0002 max mem: 26110 Train: [2] [6200/6250] eta: 0:00:13 lr: 0.000075 grad: 0.2078 (0.1941) loss: 0.9794 (0.9833) time: 0.2680 data: 0.0002 max mem: 26110 Train: [2] [6249/6250] eta: 0:00:00 lr: 0.000075 grad: 0.1658 (0.1945) loss: 0.9779 (0.9833) time: 0.2690 data: 0.0002 max mem: 26110 Train: [2] Total time: 0:28:19 (0.2720 s / it) Averaged stats: lr: 0.000075 grad: 0.1658 (0.1945) loss: 0.9779 (0.9833) Eval (hcp-train-subset): [2] [ 0/62] eta: 0:04:25 loss: 0.9791 (0.9791) time: 4.2828 data: 4.1992 max mem: 26110 Eval (hcp-train-subset): [2] [61/62] eta: 0:00:00 loss: 0.9816 (0.9799) time: 0.1280 data: 0.0434 max mem: 26110 Eval (hcp-train-subset): [2] Total time: 0:00:12 (0.2072 s / it) Averaged stats (hcp-train-subset): loss: 0.9816 (0.9799) Making plots (hcp-train-subset): example=46 Eval (hcp-val): [2] [ 0/62] eta: 0:03:19 loss: 0.9721 (0.9721) time: 3.2143 data: 3.0906 max mem: 26110 Eval (hcp-val): [2] [61/62] eta: 0:00:00 loss: 0.9772 (0.9769) time: 0.1113 data: 0.0285 max mem: 26110 Eval (hcp-val): [2] Total time: 0:00:12 (0.1997 s / it) Averaged stats (hcp-val): loss: 0.9772 (0.9769) Making plots (hcp-val): example=30 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [3] [ 0/6250] eta: 6:14:22 lr: 0.000075 grad: 0.0815 (0.0815) loss: 0.9825 (0.9825) time: 3.5939 data: 3.2303 max mem: 26110 Train: [3] [ 100/6250] eta: 0:32:32 lr: 0.000075 grad: 0.2767 (0.2848) loss: 0.9815 (0.9798) time: 0.2708 data: 0.0002 max mem: 26110 Train: [3] [ 200/6250] eta: 0:29:41 lr: 0.000076 grad: 0.1993 (0.2744) loss: 0.9803 (0.9791) time: 0.2693 data: 0.0001 max mem: 26110 Train: [3] [ 300/6250] eta: 0:28:24 lr: 0.000076 grad: 0.1408 (0.2740) loss: 0.9809 (0.9788) time: 0.2700 data: 0.0002 max mem: 26110 Train: [3] [ 400/6250] eta: 0:27:31 lr: 0.000077 grad: 0.3522 (0.2826) loss: 0.9796 (0.9786) time: 0.2694 data: 0.0002 max mem: 26110 Train: [3] [ 500/6250] eta: 0:26:51 lr: 0.000077 grad: 0.3251 (0.2828) loss: 0.9806 (0.9784) time: 0.2717 data: 0.0002 max mem: 26110 Train: [3] [ 600/6250] eta: 0:26:15 lr: 0.000077 grad: 0.2116 (0.2876) loss: 0.9780 (0.9780) time: 0.2723 data: 0.0003 max mem: 26110 Train: [3] [ 700/6250] eta: 0:25:43 lr: 0.000078 grad: 0.2537 (0.2855) loss: 0.9802 (0.9780) time: 0.2711 data: 0.0002 max mem: 26110 Train: [3] [ 800/6250] eta: 0:25:10 lr: 0.000078 grad: 0.1260 (0.2852) loss: 0.9766 (0.9777) time: 0.2743 data: 0.0003 max mem: 26110 Train: [3] [ 900/6250] eta: 0:24:39 lr: 0.000079 grad: 0.2361 (0.2809) loss: 0.9790 (0.9776) time: 0.2690 data: 0.0001 max mem: 26110 Train: [3] [1000/6250] eta: 0:24:10 lr: 0.000079 grad: 0.1771 (0.2764) loss: 0.9790 (0.9775) time: 0.2700 data: 0.0002 max mem: 26110 Train: [3] [1100/6250] eta: 0:23:40 lr: 0.000079 grad: 0.1845 (0.2733) loss: 0.9792 (0.9774) time: 0.2695 data: 0.0002 max mem: 26110 Train: [3] [1200/6250] eta: 0:23:10 lr: 0.000080 grad: 0.2347 (0.2750) loss: 0.9760 (0.9772) time: 0.2718 data: 0.0002 max mem: 26110 Train: [3] [1300/6250] eta: 0:22:41 lr: 0.000080 grad: 0.3489 (0.2761) loss: 0.9781 (0.9771) time: 0.2693 data: 0.0002 max mem: 26110 Train: [3] [1400/6250] eta: 0:22:12 lr: 0.000081 grad: 0.2259 (0.2768) loss: 0.9760 (0.9770) time: 0.2684 data: 0.0002 max mem: 26110 Train: [3] [1500/6250] eta: 0:21:44 lr: 0.000081 grad: 0.2334 (0.2746) loss: 0.9747 (0.9769) time: 0.2778 data: 0.0002 max mem: 26110 Train: [3] [1600/6250] eta: 0:21:16 lr: 0.000081 grad: 0.2666 (0.2750) loss: 0.9765 (0.9768) time: 0.2731 data: 0.0002 max mem: 26110 Train: [3] [1700/6250] eta: 0:20:47 lr: 0.000082 grad: 0.2504 (0.2762) loss: 0.9775 (0.9768) time: 0.2695 data: 0.0002 max mem: 26110 Train: [3] [1800/6250] eta: 0:20:19 lr: 0.000082 grad: 0.3191 (0.2762) loss: 0.9790 (0.9767) time: 0.2696 data: 0.0002 max mem: 26110 Train: [3] [1900/6250] eta: 0:19:51 lr: 0.000083 grad: 0.3110 (0.2758) loss: 0.9754 (0.9766) time: 0.2699 data: 0.0002 max mem: 26110 Train: [3] [2000/6250] eta: 0:19:23 lr: 0.000083 grad: 0.1918 (0.2734) loss: 0.9781 (0.9766) time: 0.2690 data: 0.0002 max mem: 26110 Train: [3] [2100/6250] eta: 0:18:54 lr: 0.000083 grad: 0.1766 (0.2736) loss: 0.9781 (0.9765) time: 0.2703 data: 0.0002 max mem: 26110 Train: [3] [2200/6250] eta: 0:18:26 lr: 0.000084 grad: 0.1644 (0.2721) loss: 0.9742 (0.9764) time: 0.2689 data: 0.0002 max mem: 26110 Train: [3] [2300/6250] eta: 0:17:59 lr: 0.000084 grad: 0.2724 (0.2721) loss: 0.9748 (0.9763) time: 0.2698 data: 0.0002 max mem: 26110 Train: [3] [2400/6250] eta: 0:17:31 lr: 0.000085 grad: 0.1686 (0.2715) loss: 0.9743 (0.9762) time: 0.2690 data: 0.0002 max mem: 26110 Train: [3] [2500/6250] eta: 0:17:03 lr: 0.000085 grad: 0.1825 (0.2720) loss: 0.9717 (0.9761) time: 0.2706 data: 0.0002 max mem: 26110 Train: [3] [2600/6250] eta: 0:16:35 lr: 0.000085 grad: 0.1430 (0.2715) loss: 0.9712 (0.9760) time: 0.2717 data: 0.0002 max mem: 26110 Train: [3] [2700/6250] eta: 0:16:08 lr: 0.000086 grad: 0.2504 (0.2706) loss: 0.9730 (0.9758) time: 0.2692 data: 0.0001 max mem: 26110 Train: [3] [2800/6250] eta: 0:15:40 lr: 0.000086 grad: 0.2433 (0.2715) loss: 0.9716 (0.9757) time: 0.2691 data: 0.0002 max mem: 26110 Train: [3] [2900/6250] eta: 0:15:13 lr: 0.000087 grad: 0.2025 (0.2720) loss: 0.9710 (0.9755) time: 0.2702 data: 0.0001 max mem: 26110 Train: [3] [3000/6250] eta: 0:14:45 lr: 0.000087 grad: 0.1586 (0.2711) loss: 0.9716 (0.9754) time: 0.2693 data: 0.0002 max mem: 26110 Train: [3] [3100/6250] eta: 0:14:18 lr: 0.000087 grad: 0.1873 (0.2695) loss: 0.9708 (0.9753) time: 0.2698 data: 0.0002 max mem: 26110 Train: [3] [3200/6250] eta: 0:13:50 lr: 0.000088 grad: 0.2377 (0.2701) loss: 0.9688 (0.9751) time: 0.2695 data: 0.0002 max mem: 26110 Train: [3] [3300/6250] eta: 0:13:23 lr: 0.000088 grad: 0.2537 (0.2695) loss: 0.9699 (0.9749) time: 0.2715 data: 0.0002 max mem: 26110 Train: [3] [3400/6250] eta: 0:12:55 lr: 0.000089 grad: 0.1920 (0.2688) loss: 0.9683 (0.9748) time: 0.2683 data: 0.0001 max mem: 26110 Train: [3] [3500/6250] eta: 0:12:28 lr: 0.000089 grad: 0.3256 (0.2686) loss: 0.9705 (0.9746) time: 0.2692 data: 0.0001 max mem: 26110 Train: [3] [3600/6250] eta: 0:12:01 lr: 0.000089 grad: 0.1778 (0.2695) loss: 0.9684 (0.9745) time: 0.2731 data: 0.0002 max mem: 26110 Train: [3] [3700/6250] eta: 0:11:33 lr: 0.000090 grad: 0.1577 (0.2689) loss: 0.9691 (0.9744) time: 0.2696 data: 0.0002 max mem: 26110 Train: [3] [3800/6250] eta: 0:11:06 lr: 0.000090 grad: 0.1752 (0.2683) loss: 0.9666 (0.9742) time: 0.2701 data: 0.0002 max mem: 26110 Train: [3] [3900/6250] eta: 0:10:39 lr: 0.000091 grad: 0.2748 (0.2686) loss: 0.9665 (0.9740) time: 0.2731 data: 0.0002 max mem: 26110 Train: [3] [4000/6250] eta: 0:10:11 lr: 0.000091 grad: 0.1670 (0.2688) loss: 0.9644 (0.9738) time: 0.2703 data: 0.0002 max mem: 26110 Train: [3] [4100/6250] eta: 0:09:44 lr: 0.000091 grad: 0.2044 (0.2679) loss: 0.9643 (0.9735) time: 0.2720 data: 0.0002 max mem: 26110 Train: [3] [4200/6250] eta: 0:09:18 lr: 0.000092 grad: 0.2079 (0.2677) loss: 0.9610 (0.9732) time: 0.2698 data: 0.0002 max mem: 26110 Train: [3] [4300/6250] eta: 0:08:50 lr: 0.000092 grad: 0.2563 (0.2686) loss: 0.9620 (0.9729) time: 0.2689 data: 0.0002 max mem: 26110 Train: [3] [4400/6250] eta: 0:08:23 lr: 0.000093 grad: 0.3087 (0.2689) loss: 0.9609 (0.9726) time: 0.2687 data: 0.0002 max mem: 26110 Train: [3] [4500/6250] eta: 0:07:56 lr: 0.000093 grad: 0.3407 (0.2695) loss: 0.9617 (0.9723) time: 0.2695 data: 0.0002 max mem: 26110 Train: [3] [4600/6250] eta: 0:07:28 lr: 0.000093 grad: 0.3007 (0.2706) loss: 0.9592 (0.9720) time: 0.2690 data: 0.0002 max mem: 26110 Train: [3] [4700/6250] eta: 0:07:01 lr: 0.000094 grad: 0.2101 (0.2711) loss: 0.9576 (0.9717) time: 0.2693 data: 0.0002 max mem: 26110 Train: [3] [4800/6250] eta: 0:06:34 lr: 0.000094 grad: 0.3212 (0.2719) loss: 0.9553 (0.9714) time: 0.2726 data: 0.0002 max mem: 26110 Train: [3] [4900/6250] eta: 0:06:07 lr: 0.000095 grad: 0.3142 (0.2723) loss: 0.9526 (0.9710) time: 0.2694 data: 0.0002 max mem: 26110 Train: [3] [5000/6250] eta: 0:05:39 lr: 0.000095 grad: 0.3365 (0.2734) loss: 0.9532 (0.9706) time: 0.2700 data: 0.0002 max mem: 26110 Train: [3] [5100/6250] eta: 0:05:12 lr: 0.000095 grad: 0.2748 (0.2738) loss: 0.9517 (0.9703) time: 0.2692 data: 0.0002 max mem: 26110 Train: [3] [5200/6250] eta: 0:04:45 lr: 0.000096 grad: 0.2685 (0.2743) loss: 0.9497 (0.9699) time: 0.2696 data: 0.0001 max mem: 26110 Train: [3] [5300/6250] eta: 0:04:18 lr: 0.000096 grad: 0.3572 (0.2753) loss: 0.9533 (0.9695) time: 0.2712 data: 0.0002 max mem: 26110 Train: [3] [5400/6250] eta: 0:03:51 lr: 0.000097 grad: 0.3000 (0.2760) loss: 0.9496 (0.9691) time: 0.2705 data: 0.0002 max mem: 26110 Train: [3] [5500/6250] eta: 0:03:23 lr: 0.000097 grad: 0.2749 (0.2762) loss: 0.9466 (0.9687) time: 0.2702 data: 0.0001 max mem: 26110 Train: [3] [5600/6250] eta: 0:02:56 lr: 0.000097 grad: 0.3545 (0.2769) loss: 0.9480 (0.9683) time: 0.2693 data: 0.0002 max mem: 26110 Train: [3] [5700/6250] eta: 0:02:29 lr: 0.000098 grad: 0.3418 (0.2783) loss: 0.9427 (0.9678) time: 0.2699 data: 0.0001 max mem: 26110 Train: [3] [5800/6250] eta: 0:02:02 lr: 0.000098 grad: 0.3611 (0.2794) loss: 0.9413 (0.9674) time: 0.2688 data: 0.0002 max mem: 26110 Train: [3] [5900/6250] eta: 0:01:35 lr: 0.000099 grad: 0.2490 (0.2800) loss: 0.9374 (0.9669) time: 0.2697 data: 0.0002 max mem: 26110 Train: [3] [6000/6250] eta: 0:01:08 lr: 0.000099 grad: 0.3172 (0.2808) loss: 0.9368 (0.9665) time: 0.2694 data: 0.0002 max mem: 26110 Train: [3] [6100/6250] eta: 0:00:40 lr: 0.000099 grad: 0.3907 (0.2818) loss: 0.9413 (0.9660) time: 0.2689 data: 0.0002 max mem: 26157 Train: [3] [6200/6250] eta: 0:00:13 lr: 0.000100 grad: 0.3175 (0.2834) loss: 0.9368 (0.9655) time: 0.2698 data: 0.0002 max mem: 26157 Train: [3] [6249/6250] eta: 0:00:00 lr: 0.000100 grad: 0.3701 (0.2840) loss: 0.9369 (0.9653) time: 0.2689 data: 0.0002 max mem: 26157 Train: [3] Total time: 0:28:29 (0.2735 s / it) Averaged stats: lr: 0.000100 grad: 0.3701 (0.2840) loss: 0.9369 (0.9653) Eval (hcp-train-subset): [3] [ 0/62] eta: 0:03:33 loss: 0.9426 (0.9426) time: 3.4422 data: 3.3165 max mem: 26157 Eval (hcp-train-subset): [3] [61/62] eta: 0:00:00 loss: 0.9401 (0.9384) time: 0.1138 data: 0.0290 max mem: 26157 Eval (hcp-train-subset): [3] Total time: 0:00:12 (0.2022 s / it) Averaged stats (hcp-train-subset): loss: 0.9401 (0.9384) Making plots (hcp-train-subset): example=52 Eval (hcp-val): [3] [ 0/62] eta: 0:03:02 loss: 0.9323 (0.9323) time: 2.9358 data: 2.8099 max mem: 26157 Eval (hcp-val): [3] [61/62] eta: 0:00:00 loss: 0.9349 (0.9344) time: 0.1198 data: 0.0367 max mem: 26157 Eval (hcp-val): [3] Total time: 0:00:12 (0.2093 s / it) Averaged stats (hcp-val): loss: 0.9349 (0.9344) Making plots (hcp-val): example=33 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [4] [ 0/6250] eta: 10:54:53 lr: 0.000100 grad: 0.4930 (0.4930) loss: 0.9342 (0.9342) time: 6.2870 data: 6.0115 max mem: 26157 Train: [4] [ 100/6250] eta: 0:35:02 lr: 0.000100 grad: 0.3176 (0.3722) loss: 0.9426 (0.9375) time: 0.2726 data: 0.0002 max mem: 26157 Train: [4] [ 200/6250] eta: 0:30:53 lr: 0.000101 grad: 0.2440 (0.3436) loss: 0.9386 (0.9367) time: 0.2710 data: 0.0002 max mem: 26157 Train: [4] [ 300/6250] eta: 0:29:11 lr: 0.000101 grad: 0.3442 (0.3450) loss: 0.9342 (0.9359) time: 0.2695 data: 0.0002 max mem: 26157 Train: [4] [ 400/6250] eta: 0:28:06 lr: 0.000102 grad: 0.4009 (0.3433) loss: 0.9298 (0.9349) time: 0.2706 data: 0.0002 max mem: 26157 Train: [4] [ 500/6250] eta: 0:27:18 lr: 0.000102 grad: 0.3295 (0.3430) loss: 0.9322 (0.9343) time: 0.2736 data: 0.0002 max mem: 26157 Train: [4] [ 600/6250] eta: 0:26:37 lr: 0.000102 grad: 0.3349 (0.3445) loss: 0.9285 (0.9337) time: 0.2697 data: 0.0002 max mem: 26157 Train: [4] [ 700/6250] eta: 0:26:00 lr: 0.000103 grad: 0.2766 (0.3442) loss: 0.9270 (0.9331) time: 0.2741 data: 0.0003 max mem: 26157 Train: [4] [ 800/6250] eta: 0:25:25 lr: 0.000103 grad: 0.2476 (0.3428) loss: 0.9313 (0.9328) time: 0.2717 data: 0.0002 max mem: 26157 Train: [4] [ 900/6250] eta: 0:24:52 lr: 0.000104 grad: 0.3016 (0.3415) loss: 0.9273 (0.9323) time: 0.2705 data: 0.0002 max mem: 26157 Train: [4] [1000/6250] eta: 0:24:20 lr: 0.000104 grad: 0.2983 (0.3404) loss: 0.9276 (0.9318) time: 0.2708 data: 0.0002 max mem: 26157 Train: [4] [1100/6250] eta: 0:23:48 lr: 0.000104 grad: 0.3702 (0.3421) loss: 0.9272 (0.9314) time: 0.2714 data: 0.0002 max mem: 26157 Train: [4] [1200/6250] eta: 0:23:17 lr: 0.000105 grad: 0.3041 (0.3437) loss: 0.9290 (0.9311) time: 0.2691 data: 0.0002 max mem: 26157 Train: [4] [1300/6250] eta: 0:22:47 lr: 0.000105 grad: 0.2478 (0.3406) loss: 0.9272 (0.9306) time: 0.2713 data: 0.0002 max mem: 26157 Train: [4] [1400/6250] eta: 0:22:17 lr: 0.000106 grad: 0.3095 (0.3384) loss: 0.9250 (0.9303) time: 0.2699 data: 0.0002 max mem: 26157 Train: [4] [1500/6250] eta: 0:21:48 lr: 0.000106 grad: 0.2952 (0.3390) loss: 0.9237 (0.9299) time: 0.2703 data: 0.0002 max mem: 26157 Train: [4] [1600/6250] eta: 0:21:19 lr: 0.000106 grad: 0.3573 (0.3388) loss: 0.9236 (0.9296) time: 0.2699 data: 0.0002 max mem: 26157 Train: [4] [1700/6250] eta: 0:20:50 lr: 0.000107 grad: 0.3191 (0.3393) loss: 0.9238 (0.9293) time: 0.2703 data: 0.0002 max mem: 26157 Train: [4] [1800/6250] eta: 0:20:22 lr: 0.000107 grad: 0.2943 (0.3393) loss: 0.9267 (0.9291) time: 0.2718 data: 0.0002 max mem: 26157 Train: [4] [1900/6250] eta: 0:19:54 lr: 0.000108 grad: 0.2244 (0.3371) loss: 0.9227 (0.9289) time: 0.2749 data: 0.0002 max mem: 26157 Train: [4] [2000/6250] eta: 0:19:26 lr: 0.000108 grad: 0.2634 (0.3360) loss: 0.9223 (0.9286) time: 0.2695 data: 0.0002 max mem: 26157 Train: [4] [2100/6250] eta: 0:18:58 lr: 0.000108 grad: 0.2574 (0.3362) loss: 0.9236 (0.9283) time: 0.2702 data: 0.0001 max mem: 26157 Train: [4] [2200/6250] eta: 0:18:30 lr: 0.000109 grad: 0.3094 (0.3353) loss: 0.9264 (0.9280) time: 0.2701 data: 0.0002 max mem: 26157 Train: [4] [2300/6250] eta: 0:18:02 lr: 0.000109 grad: 0.2654 (0.3356) loss: 0.9210 (0.9278) time: 0.2699 data: 0.0002 max mem: 26157 Train: [4] [2400/6250] eta: 0:17:37 lr: 0.000110 grad: 0.2588 (0.3350) loss: 0.9196 (0.9276) time: 0.2712 data: 0.0002 max mem: 26157 Train: [4] [2500/6250] eta: 0:17:09 lr: 0.000110 grad: 0.2461 (0.3341) loss: 0.9192 (0.9273) time: 0.2699 data: 0.0002 max mem: 26157 Train: [4] [2600/6250] eta: 0:16:41 lr: 0.000110 grad: 0.3244 (0.3345) loss: 0.9230 (0.9271) time: 0.2686 data: 0.0002 max mem: 26157 Train: [4] [2700/6250] eta: 0:16:13 lr: 0.000111 grad: 0.3775 (0.3349) loss: 0.9234 (0.9268) time: 0.2687 data: 0.0002 max mem: 26157 Train: [4] [2800/6250] eta: 0:15:44 lr: 0.000111 grad: 0.3044 (0.3339) loss: 0.9222 (0.9266) time: 0.2706 data: 0.0002 max mem: 26157 Train: [4] [2900/6250] eta: 0:15:16 lr: 0.000112 grad: 0.3170 (0.3330) loss: 0.9192 (0.9264) time: 0.2685 data: 0.0001 max mem: 26157 Train: [4] [3000/6250] eta: 0:14:48 lr: 0.000112 grad: 0.2305 (0.3328) loss: 0.9179 (0.9262) time: 0.2689 data: 0.0002 max mem: 26157 Train: [4] [3100/6250] eta: 0:14:21 lr: 0.000112 grad: 0.2680 (0.3319) loss: 0.9173 (0.9259) time: 0.2700 data: 0.0002 max mem: 26157 Train: [4] [3200/6250] eta: 0:13:53 lr: 0.000113 grad: 0.2466 (0.3304) loss: 0.9164 (0.9257) time: 0.2677 data: 0.0002 max mem: 26157 Train: [4] [3300/6250] eta: 0:13:25 lr: 0.000113 grad: 0.2484 (0.3294) loss: 0.9184 (0.9255) time: 0.2701 data: 0.0002 max mem: 26157 Train: [4] [3400/6250] eta: 0:12:58 lr: 0.000114 grad: 0.2573 (0.3286) loss: 0.9176 (0.9253) time: 0.2700 data: 0.0002 max mem: 26157 Train: [4] [3500/6250] eta: 0:12:30 lr: 0.000114 grad: 0.2669 (0.3277) loss: 0.9156 (0.9250) time: 0.2686 data: 0.0002 max mem: 26157 Train: [4] [3600/6250] eta: 0:12:03 lr: 0.000114 grad: 0.2784 (0.3273) loss: 0.9150 (0.9248) time: 0.2696 data: 0.0002 max mem: 26157 Train: [4] [3700/6250] eta: 0:11:35 lr: 0.000115 grad: 0.3003 (0.3271) loss: 0.9114 (0.9246) time: 0.2685 data: 0.0002 max mem: 26157 Train: [4] [3800/6250] eta: 0:11:07 lr: 0.000115 grad: 0.3616 (0.3266) loss: 0.9188 (0.9244) time: 0.2681 data: 0.0002 max mem: 26157 Train: [4] [3900/6250] eta: 0:10:40 lr: 0.000116 grad: 0.3130 (0.3256) loss: 0.9163 (0.9241) time: 0.2678 data: 0.0002 max mem: 26157 Train: [4] [4000/6250] eta: 0:10:13 lr: 0.000116 grad: 0.2573 (0.3245) loss: 0.9138 (0.9239) time: 0.2685 data: 0.0002 max mem: 26157 Train: [4] [4100/6250] eta: 0:09:46 lr: 0.000116 grad: 0.2459 (0.3237) loss: 0.9140 (0.9236) time: 0.2702 data: 0.0002 max mem: 26157 Train: [4] [4200/6250] eta: 0:09:18 lr: 0.000117 grad: 0.2672 (0.3231) loss: 0.9136 (0.9234) time: 0.2713 data: 0.0002 max mem: 26157 Train: [4] [4300/6250] eta: 0:08:51 lr: 0.000117 grad: 0.2435 (0.3223) loss: 0.9166 (0.9232) time: 0.2703 data: 0.0002 max mem: 26157 Train: [4] [4400/6250] eta: 0:08:24 lr: 0.000118 grad: 0.2322 (0.3223) loss: 0.9143 (0.9231) time: 0.2685 data: 0.0002 max mem: 26157 Train: [4] [4500/6250] eta: 0:07:56 lr: 0.000118 grad: 0.2934 (0.3223) loss: 0.9132 (0.9229) time: 0.2691 data: 0.0003 max mem: 26157 Train: [4] [4600/6250] eta: 0:07:29 lr: 0.000118 grad: 0.2788 (0.3213) loss: 0.9151 (0.9227) time: 0.2682 data: 0.0002 max mem: 26157 Train: [4] [4700/6250] eta: 0:07:01 lr: 0.000119 grad: 0.3466 (0.3207) loss: 0.9176 (0.9225) time: 0.2681 data: 0.0002 max mem: 26157 Train: [4] [4800/6250] eta: 0:06:34 lr: 0.000119 grad: 0.2863 (0.3197) loss: 0.9109 (0.9223) time: 0.2715 data: 0.0002 max mem: 26157 Train: [4] [4900/6250] eta: 0:06:07 lr: 0.000120 grad: 0.2392 (0.3195) loss: 0.9145 (0.9221) time: 0.2701 data: 0.0002 max mem: 26157 Train: [4] [5000/6250] eta: 0:05:40 lr: 0.000120 grad: 0.3337 (0.3196) loss: 0.9181 (0.9220) time: 0.2703 data: 0.0002 max mem: 26157 Train: [4] [5100/6250] eta: 0:05:12 lr: 0.000120 grad: 0.2476 (0.3186) loss: 0.9132 (0.9218) time: 0.2698 data: 0.0002 max mem: 26157 Train: [4] [5200/6250] eta: 0:04:45 lr: 0.000121 grad: 0.3021 (0.3181) loss: 0.9133 (0.9216) time: 0.2685 data: 0.0002 max mem: 26157 Train: [4] [5300/6250] eta: 0:04:18 lr: 0.000121 grad: 0.2428 (0.3172) loss: 0.9104 (0.9214) time: 0.2700 data: 0.0002 max mem: 26157 Train: [4] [5400/6250] eta: 0:03:51 lr: 0.000122 grad: 0.2788 (0.3168) loss: 0.9098 (0.9213) time: 0.2731 data: 0.0002 max mem: 26157 Train: [4] [5500/6250] eta: 0:03:23 lr: 0.000122 grad: 0.3624 (0.3160) loss: 0.9127 (0.9211) time: 0.2699 data: 0.0002 max mem: 26157 Train: [4] [5600/6250] eta: 0:02:56 lr: 0.000122 grad: 0.2094 (0.3154) loss: 0.9116 (0.9209) time: 0.2699 data: 0.0002 max mem: 26157 Train: [4] [5700/6250] eta: 0:02:29 lr: 0.000123 grad: 0.2660 (0.3144) loss: 0.9104 (0.9207) time: 0.2713 data: 0.0002 max mem: 26157 Train: [4] [5800/6250] eta: 0:02:02 lr: 0.000123 grad: 0.2497 (0.3136) loss: 0.9107 (0.9205) time: 0.2694 data: 0.0002 max mem: 26157 Train: [4] [5900/6250] eta: 0:01:35 lr: 0.000124 grad: 0.2111 (0.3131) loss: 0.9084 (0.9204) time: 0.2727 data: 0.0002 max mem: 26157 Train: [4] [6000/6250] eta: 0:01:08 lr: 0.000124 grad: 0.2697 (0.3123) loss: 0.9097 (0.9203) time: 0.2728 data: 0.0002 max mem: 26157 Train: [4] [6100/6250] eta: 0:00:40 lr: 0.000124 grad: 0.2316 (0.3113) loss: 0.9118 (0.9201) time: 0.2699 data: 0.0002 max mem: 26157 Train: [4] [6200/6250] eta: 0:00:13 lr: 0.000125 grad: 0.2301 (0.3103) loss: 0.9098 (0.9199) time: 0.2710 data: 0.0002 max mem: 26157 Train: [4] [6249/6250] eta: 0:00:00 lr: 0.000125 grad: 0.2027 (0.3098) loss: 0.9123 (0.9199) time: 0.2710 data: 0.0002 max mem: 26157 Train: [4] Total time: 0:28:25 (0.2729 s / it) Averaged stats: lr: 0.000125 grad: 0.2027 (0.3098) loss: 0.9123 (0.9199) Eval (hcp-train-subset): [4] [ 0/62] eta: 0:04:36 loss: 0.9200 (0.9200) time: 4.4615 data: 4.3773 max mem: 26157 Eval (hcp-train-subset): [4] [61/62] eta: 0:00:00 loss: 0.9149 (0.9142) time: 0.1307 data: 0.0479 max mem: 26157 Eval (hcp-train-subset): [4] Total time: 0:00:13 (0.2102 s / it) Averaged stats (hcp-train-subset): loss: 0.9149 (0.9142) Making plots (hcp-train-subset): example=5 Eval (hcp-val): [4] [ 0/62] eta: 0:04:56 loss: 0.9055 (0.9055) time: 4.7811 data: 4.6969 max mem: 26157 Eval (hcp-val): [4] [61/62] eta: 0:00:00 loss: 0.9096 (0.9095) time: 0.1395 data: 0.0569 max mem: 26157 Eval (hcp-val): [4] Total time: 0:00:13 (0.2138 s / it) Averaged stats (hcp-val): loss: 0.9096 (0.9095) Making plots (hcp-val): example=26 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [5] [ 0/6250] eta: 7:46:06 lr: 0.000125 grad: 0.2023 (0.2023) loss: 0.9408 (0.9408) time: 4.4747 data: 4.1715 max mem: 26157 Train: [5] [ 100/6250] eta: 0:33:37 lr: 0.000125 grad: 0.1894 (0.2282) loss: 0.9209 (0.9188) time: 0.2702 data: 0.0002 max mem: 26157 Train: [5] [ 200/6250] eta: 0:30:09 lr: 0.000125 grad: 0.2084 (0.2424) loss: 0.9167 (0.9158) time: 0.2689 data: 0.0002 max mem: 26157 Train: [5] [ 300/6250] eta: 0:28:40 lr: 0.000125 grad: 0.2713 (0.2478) loss: 0.9078 (0.9142) time: 0.2683 data: 0.0002 max mem: 26157 Train: [5] [ 400/6250] eta: 0:27:42 lr: 0.000125 grad: 0.2210 (0.2482) loss: 0.9047 (0.9133) time: 0.2691 data: 0.0002 max mem: 26157 Train: [5] [ 500/6250] eta: 0:26:57 lr: 0.000125 grad: 0.2520 (0.2517) loss: 0.9104 (0.9124) time: 0.2694 data: 0.0002 max mem: 26157 Train: [5] [ 600/6250] eta: 0:26:20 lr: 0.000125 grad: 0.2976 (0.2526) loss: 0.9021 (0.9115) time: 0.2728 data: 0.0002 max mem: 26157 Train: [5] [ 700/6250] eta: 0:25:46 lr: 0.000125 grad: 0.2477 (0.2544) loss: 0.9034 (0.9105) time: 0.2732 data: 0.0003 max mem: 26157 Train: [5] [ 800/6250] eta: 0:25:12 lr: 0.000125 grad: 0.2779 (0.2527) loss: 0.9069 (0.9101) time: 0.2685 data: 0.0002 max mem: 26157 Train: [5] [ 900/6250] eta: 0:24:39 lr: 0.000125 grad: 0.1818 (0.2491) loss: 0.9071 (0.9098) time: 0.2678 data: 0.0001 max mem: 26157 Train: [5] [1000/6250] eta: 0:24:07 lr: 0.000125 grad: 0.2165 (0.2474) loss: 0.9075 (0.9096) time: 0.2687 data: 0.0003 max mem: 26157 Train: [5] [1100/6250] eta: 0:23:36 lr: 0.000125 grad: 0.2313 (0.2441) loss: 0.9088 (0.9094) time: 0.2688 data: 0.0002 max mem: 26157 Train: [5] [1200/6250] eta: 0:23:06 lr: 0.000125 grad: 0.2476 (0.2446) loss: 0.9117 (0.9094) time: 0.2682 data: 0.0002 max mem: 26157 Train: [5] [1300/6250] eta: 0:22:37 lr: 0.000125 grad: 0.1916 (0.2425) loss: 0.9068 (0.9091) time: 0.2695 data: 0.0002 max mem: 26157 Train: [5] [1400/6250] eta: 0:22:07 lr: 0.000125 grad: 0.1607 (0.2413) loss: 0.9046 (0.9088) time: 0.2677 data: 0.0002 max mem: 26157 Train: [5] [1500/6250] eta: 0:21:38 lr: 0.000125 grad: 0.2400 (0.2428) loss: 0.9036 (0.9085) time: 0.2683 data: 0.0002 max mem: 26157 Train: [5] [1600/6250] eta: 0:21:09 lr: 0.000125 grad: 0.2060 (0.2416) loss: 0.9043 (0.9083) time: 0.2678 data: 0.0001 max mem: 26157 Train: [5] [1700/6250] eta: 0:20:41 lr: 0.000125 grad: 0.1847 (0.2410) loss: 0.9022 (0.9080) time: 0.2690 data: 0.0002 max mem: 26157 Train: [5] [1800/6250] eta: 0:20:13 lr: 0.000125 grad: 0.1960 (0.2412) loss: 0.9027 (0.9079) time: 0.2688 data: 0.0002 max mem: 26157 Train: [5] [1900/6250] eta: 0:19:45 lr: 0.000125 grad: 0.2372 (0.2410) loss: 0.9019 (0.9076) time: 0.2740 data: 0.0002 max mem: 26157 Train: [5] [2000/6250] eta: 0:19:17 lr: 0.000125 grad: 0.2391 (0.2402) loss: 0.9021 (0.9074) time: 0.2723 data: 0.0002 max mem: 26157 Train: [5] [2100/6250] eta: 0:18:50 lr: 0.000125 grad: 0.2601 (0.2400) loss: 0.9038 (0.9072) time: 0.2686 data: 0.0002 max mem: 26157 Train: [5] [2200/6250] eta: 0:18:22 lr: 0.000125 grad: 0.1872 (0.2391) loss: 0.9054 (0.9070) time: 0.2688 data: 0.0002 max mem: 26157 Train: [5] [2300/6250] eta: 0:17:54 lr: 0.000125 grad: 0.2056 (0.2388) loss: 0.9062 (0.9069) time: 0.2693 data: 0.0002 max mem: 26157 Train: [5] [2400/6250] eta: 0:17:27 lr: 0.000125 grad: 0.1923 (0.2381) loss: 0.9036 (0.9067) time: 0.2722 data: 0.0002 max mem: 26157 Train: [5] [2500/6250] eta: 0:16:59 lr: 0.000125 grad: 0.2455 (0.2381) loss: 0.8997 (0.9065) time: 0.2735 data: 0.0002 max mem: 26157 Train: [5] [2600/6250] eta: 0:16:32 lr: 0.000125 grad: 0.2332 (0.2380) loss: 0.9033 (0.9064) time: 0.2731 data: 0.0002 max mem: 26157 Train: [5] [2700/6250] eta: 0:16:05 lr: 0.000125 grad: 0.2179 (0.2373) loss: 0.9005 (0.9063) time: 0.2720 data: 0.0002 max mem: 26157 Train: [5] [2800/6250] eta: 0:15:38 lr: 0.000125 grad: 0.1955 (0.2365) loss: 0.9079 (0.9062) time: 0.2753 data: 0.0002 max mem: 26157 Train: [5] [2900/6250] eta: 0:15:10 lr: 0.000125 grad: 0.1696 (0.2359) loss: 0.9016 (0.9061) time: 0.2696 data: 0.0002 max mem: 26157 Train: [5] [3000/6250] eta: 0:14:43 lr: 0.000125 grad: 0.2088 (0.2347) loss: 0.9029 (0.9060) time: 0.2695 data: 0.0002 max mem: 26157 Train: [5] [3100/6250] eta: 0:14:15 lr: 0.000125 grad: 0.2108 (0.2342) loss: 0.8977 (0.9058) time: 0.2688 data: 0.0002 max mem: 26157 Train: [5] [3200/6250] eta: 0:13:48 lr: 0.000125 grad: 0.1657 (0.2334) loss: 0.9040 (0.9057) time: 0.2684 data: 0.0002 max mem: 26157 Train: [5] [3300/6250] eta: 0:13:27 lr: 0.000125 grad: 0.1904 (0.2336) loss: 0.9011 (0.9055) time: 0.6378 data: 0.3610 max mem: 26157 Train: [5] [3400/6250] eta: 0:13:00 lr: 0.000125 grad: 0.1918 (0.2329) loss: 0.9045 (0.9054) time: 0.2690 data: 0.0002 max mem: 26157 Train: [5] [3500/6250] eta: 0:12:32 lr: 0.000125 grad: 0.1806 (0.2320) loss: 0.9036 (0.9052) time: 0.2713 data: 0.0002 max mem: 26157 Train: [5] [3600/6250] eta: 0:12:05 lr: 0.000125 grad: 0.2107 (0.2322) loss: 0.9019 (0.9050) time: 0.2683 data: 0.0002 max mem: 26157 Train: [5] [3700/6250] eta: 0:11:37 lr: 0.000125 grad: 0.1659 (0.2313) loss: 0.8990 (0.9048) time: 0.2683 data: 0.0002 max mem: 26157 Train: [5] [3800/6250] eta: 0:11:10 lr: 0.000125 grad: 0.1657 (0.2306) loss: 0.8990 (0.9047) time: 0.2690 data: 0.0002 max mem: 26157 Train: [5] [3900/6250] eta: 0:10:42 lr: 0.000125 grad: 0.1923 (0.2302) loss: 0.8996 (0.9045) time: 0.2702 data: 0.0002 max mem: 26157 Train: [5] [4000/6250] eta: 0:10:14 lr: 0.000125 grad: 0.2156 (0.2302) loss: 0.8985 (0.9044) time: 0.2688 data: 0.0002 max mem: 26157 Train: [5] [4100/6250] eta: 0:09:47 lr: 0.000125 grad: 0.1636 (0.2295) loss: 0.8930 (0.9042) time: 0.2777 data: 0.0064 max mem: 26157 Train: [5] [4200/6250] eta: 0:09:19 lr: 0.000125 grad: 0.1658 (0.2289) loss: 0.8977 (0.9041) time: 0.2692 data: 0.0002 max mem: 26157 Train: [5] [4300/6250] eta: 0:08:52 lr: 0.000125 grad: 0.1582 (0.2281) loss: 0.9014 (0.9039) time: 0.2703 data: 0.0002 max mem: 26157 Train: [5] [4400/6250] eta: 0:08:25 lr: 0.000125 grad: 0.1908 (0.2277) loss: 0.8956 (0.9038) time: 0.2702 data: 0.0002 max mem: 26157 Train: [5] [4500/6250] eta: 0:07:57 lr: 0.000125 grad: 0.2406 (0.2273) loss: 0.8964 (0.9036) time: 0.2694 data: 0.0002 max mem: 26157 Train: [5] [4600/6250] eta: 0:07:30 lr: 0.000125 grad: 0.2129 (0.2271) loss: 0.8926 (0.9034) time: 0.2679 data: 0.0001 max mem: 26157 Train: [5] [4700/6250] eta: 0:07:02 lr: 0.000125 grad: 0.1836 (0.2266) loss: 0.8928 (0.9032) time: 0.2687 data: 0.0002 max mem: 26157 Train: [5] [4800/6250] eta: 0:06:35 lr: 0.000125 grad: 0.1990 (0.2263) loss: 0.8923 (0.9030) time: 0.2687 data: 0.0002 max mem: 26157 Train: [5] [4900/6250] eta: 0:06:08 lr: 0.000125 grad: 0.1561 (0.2259) loss: 0.8916 (0.9028) time: 0.2712 data: 0.0002 max mem: 26157 Train: [5] [5000/6250] eta: 0:05:40 lr: 0.000125 grad: 0.2061 (0.2254) loss: 0.8924 (0.9026) time: 0.2723 data: 0.0002 max mem: 26157 Train: [5] [5100/6250] eta: 0:05:13 lr: 0.000125 grad: 0.1846 (0.2248) loss: 0.8920 (0.9024) time: 0.2686 data: 0.0002 max mem: 26157 Train: [5] [5200/6250] eta: 0:04:46 lr: 0.000125 grad: 0.2090 (0.2246) loss: 0.8901 (0.9021) time: 0.2678 data: 0.0002 max mem: 26157 Train: [5] [5300/6250] eta: 0:04:18 lr: 0.000125 grad: 0.1999 (0.2242) loss: 0.8887 (0.9019) time: 0.2676 data: 0.0002 max mem: 26157 Train: [5] [5400/6250] eta: 0:03:51 lr: 0.000125 grad: 0.1801 (0.2237) loss: 0.8831 (0.9016) time: 0.2679 data: 0.0002 max mem: 26157 Train: [5] [5500/6250] eta: 0:03:24 lr: 0.000125 grad: 0.2115 (0.2235) loss: 0.8904 (0.9014) time: 0.2676 data: 0.0002 max mem: 26157 Train: [5] [5600/6250] eta: 0:02:56 lr: 0.000125 grad: 0.2020 (0.2232) loss: 0.8816 (0.9011) time: 0.2715 data: 0.0002 max mem: 26157 Train: [5] [5700/6250] eta: 0:02:29 lr: 0.000125 grad: 0.1903 (0.2229) loss: 0.8836 (0.9008) time: 0.2693 data: 0.0002 max mem: 26157 Train: [5] [5800/6250] eta: 0:02:02 lr: 0.000125 grad: 0.1822 (0.2223) loss: 0.8864 (0.9006) time: 0.2700 data: 0.0002 max mem: 26157 Train: [5] [5900/6250] eta: 0:01:35 lr: 0.000125 grad: 0.2036 (0.2222) loss: 0.8838 (0.9003) time: 0.2686 data: 0.0002 max mem: 26157 Train: [5] [6000/6250] eta: 0:01:08 lr: 0.000125 grad: 0.2003 (0.2221) loss: 0.8840 (0.9000) time: 0.2719 data: 0.0002 max mem: 26157 Train: [5] [6100/6250] eta: 0:00:40 lr: 0.000125 grad: 0.1967 (0.2217) loss: 0.8809 (0.8998) time: 0.2682 data: 0.0002 max mem: 26157 Train: [5] [6200/6250] eta: 0:00:13 lr: 0.000125 grad: 0.1879 (0.2217) loss: 0.8842 (0.8995) time: 0.2683 data: 0.0001 max mem: 26157 Train: [5] [6249/6250] eta: 0:00:00 lr: 0.000125 grad: 0.1824 (0.2214) loss: 0.8819 (0.8994) time: 0.2676 data: 0.0001 max mem: 26157 Train: [5] Total time: 0:28:24 (0.2727 s / it) Averaged stats: lr: 0.000125 grad: 0.1824 (0.2214) loss: 0.8819 (0.8994) Eval (hcp-train-subset): [5] [ 0/62] eta: 0:04:40 loss: 0.8934 (0.8934) time: 4.5191 data: 4.4357 max mem: 26157 Eval (hcp-train-subset): [5] [61/62] eta: 0:00:00 loss: 0.8870 (0.8862) time: 0.1307 data: 0.0477 max mem: 26157 Eval (hcp-train-subset): [5] Total time: 0:00:13 (0.2105 s / it) Averaged stats (hcp-train-subset): loss: 0.8870 (0.8862) Making plots (hcp-train-subset): example=38 Eval (hcp-val): [5] [ 0/62] eta: 0:05:45 loss: 0.8784 (0.8784) time: 5.5769 data: 5.4928 max mem: 26157 Eval (hcp-val): [5] [61/62] eta: 0:00:00 loss: 0.8795 (0.8809) time: 0.1193 data: 0.0364 max mem: 26157 Eval (hcp-val): [5] Total time: 0:00:13 (0.2104 s / it) Averaged stats (hcp-val): loss: 0.8795 (0.8809) Making plots (hcp-val): example=4 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [6] [ 0/6250] eta: 9:00:55 lr: 0.000125 grad: 0.2315 (0.2315) loss: 0.8599 (0.8599) time: 5.1929 data: 4.8768 max mem: 26157 Train: [6] [ 100/6250] eta: 0:33:47 lr: 0.000125 grad: 0.1563 (0.1891) loss: 0.8863 (0.8847) time: 0.2718 data: 0.0002 max mem: 26157 Train: [6] [ 200/6250] eta: 0:30:18 lr: 0.000125 grad: 0.1656 (0.2004) loss: 0.8863 (0.8839) time: 0.2719 data: 0.0002 max mem: 26157 Train: [6] [ 300/6250] eta: 0:28:48 lr: 0.000125 grad: 0.1889 (0.1953) loss: 0.8819 (0.8836) time: 0.2698 data: 0.0002 max mem: 26157 Train: [6] [ 400/6250] eta: 0:27:50 lr: 0.000125 grad: 0.1798 (0.1955) loss: 0.8865 (0.8838) time: 0.2730 data: 0.0002 max mem: 26157 Train: [6] [ 500/6250] eta: 0:27:05 lr: 0.000125 grad: 0.1715 (0.1973) loss: 0.8754 (0.8834) time: 0.2701 data: 0.0002 max mem: 26157 Train: [6] [ 600/6250] eta: 0:26:25 lr: 0.000125 grad: 0.1985 (0.1990) loss: 0.8816 (0.8831) time: 0.2716 data: 0.0002 max mem: 26157 Train: [6] [ 700/6250] eta: 0:25:50 lr: 0.000125 grad: 0.1542 (0.1997) loss: 0.8746 (0.8826) time: 0.2729 data: 0.0002 max mem: 26157 Train: [6] [ 800/6250] eta: 0:25:23 lr: 0.000125 grad: 0.1975 (0.2007) loss: 0.8793 (0.8821) time: 0.3120 data: 0.0379 max mem: 26157 Train: [6] [ 900/6250] eta: 0:24:51 lr: 0.000125 grad: 0.1742 (0.1990) loss: 0.8771 (0.8819) time: 0.2723 data: 0.0002 max mem: 26157 Train: [6] [1000/6250] eta: 0:24:19 lr: 0.000125 grad: 0.1833 (0.1979) loss: 0.8808 (0.8816) time: 0.2692 data: 0.0002 max mem: 26157 Train: [6] [1100/6250] eta: 0:23:49 lr: 0.000125 grad: 0.3059 (0.1987) loss: 0.8774 (0.8813) time: 0.2692 data: 0.0002 max mem: 26157 Train: [6] [1200/6250] eta: 0:23:18 lr: 0.000125 grad: 0.1682 (0.1983) loss: 0.8806 (0.8811) time: 0.2690 data: 0.0002 max mem: 26157 Train: [6] [1300/6250] eta: 0:22:53 lr: 0.000125 grad: 0.1492 (0.1969) loss: 0.8787 (0.8811) time: 0.2727 data: 0.0002 max mem: 26157 Train: [6] [1400/6250] eta: 0:22:23 lr: 0.000125 grad: 0.2112 (0.1968) loss: 0.8757 (0.8807) time: 0.2687 data: 0.0002 max mem: 26157 Train: [6] [1500/6250] eta: 0:21:53 lr: 0.000125 grad: 0.1788 (0.1957) loss: 0.8755 (0.8805) time: 0.2695 data: 0.0002 max mem: 26157 Train: [6] [1600/6250] eta: 0:21:54 lr: 0.000125 grad: 0.1465 (0.1952) loss: 0.8719 (0.8801) time: 0.2714 data: 0.0002 max mem: 26157 Train: [6] [1700/6250] eta: 0:21:23 lr: 0.000125 grad: 0.1740 (0.1940) loss: 0.8782 (0.8800) time: 0.2720 data: 0.0002 max mem: 26157 Train: [6] [1800/6250] eta: 0:20:53 lr: 0.000125 grad: 0.1475 (0.1923) loss: 0.8742 (0.8798) time: 0.2729 data: 0.0002 max mem: 26157 Train: [6] [1900/6250] eta: 0:20:22 lr: 0.000125 grad: 0.1689 (0.1916) loss: 0.8750 (0.8795) time: 0.2698 data: 0.0002 max mem: 26157 Train: [6] [2000/6250] eta: 0:19:51 lr: 0.000125 grad: 0.1427 (0.1910) loss: 0.8773 (0.8794) time: 0.2683 data: 0.0001 max mem: 26157 Train: [6] [2100/6250] eta: 0:19:21 lr: 0.000125 grad: 0.1568 (0.1898) loss: 0.8760 (0.8792) time: 0.2687 data: 0.0002 max mem: 26157 Train: [6] [2200/6250] eta: 0:18:51 lr: 0.000125 grad: 0.1544 (0.1885) loss: 0.8718 (0.8790) time: 0.2692 data: 0.0002 max mem: 26157 Train: [6] [2300/6250] eta: 0:18:22 lr: 0.000125 grad: 0.1623 (0.1882) loss: 0.8724 (0.8787) time: 0.2685 data: 0.0002 max mem: 26157 Train: [6] [2400/6250] eta: 0:17:53 lr: 0.000125 grad: 0.1609 (0.1874) loss: 0.8709 (0.8785) time: 0.2682 data: 0.0002 max mem: 26157 Train: [6] [2500/6250] eta: 0:17:24 lr: 0.000125 grad: 0.1482 (0.1864) loss: 0.8682 (0.8783) time: 0.2699 data: 0.0002 max mem: 26157 Train: [6] [2600/6250] eta: 0:16:55 lr: 0.000125 grad: 0.1984 (0.1857) loss: 0.8749 (0.8781) time: 0.2694 data: 0.0002 max mem: 26157 Train: [6] [2700/6250] eta: 0:16:26 lr: 0.000125 grad: 0.1729 (0.1851) loss: 0.8793 (0.8780) time: 0.2686 data: 0.0002 max mem: 26157 Train: [6] [2800/6250] eta: 0:15:57 lr: 0.000125 grad: 0.1597 (0.1846) loss: 0.8690 (0.8778) time: 0.2713 data: 0.0002 max mem: 26157 Train: [6] [2900/6250] eta: 0:15:28 lr: 0.000125 grad: 0.1572 (0.1843) loss: 0.8725 (0.8776) time: 0.2698 data: 0.0002 max mem: 26157 Train: [6] [3000/6250] eta: 0:15:00 lr: 0.000125 grad: 0.1391 (0.1837) loss: 0.8729 (0.8775) time: 0.2683 data: 0.0002 max mem: 26157 Train: [6] [3100/6250] eta: 0:14:31 lr: 0.000125 grad: 0.1414 (0.1829) loss: 0.8746 (0.8773) time: 0.2713 data: 0.0002 max mem: 26157 Train: [6] [3200/6250] eta: 0:14:03 lr: 0.000125 grad: 0.1660 (0.1821) loss: 0.8722 (0.8771) time: 0.2686 data: 0.0001 max mem: 26157 Train: [6] [3300/6250] eta: 0:13:35 lr: 0.000125 grad: 0.1427 (0.1814) loss: 0.8710 (0.8769) time: 0.2682 data: 0.0002 max mem: 26157 Train: [6] [3400/6250] eta: 0:13:06 lr: 0.000125 grad: 0.1518 (0.1806) loss: 0.8715 (0.8767) time: 0.2766 data: 0.0002 max mem: 26157 Train: [6] [3500/6250] eta: 0:12:38 lr: 0.000125 grad: 0.1417 (0.1799) loss: 0.8676 (0.8765) time: 0.2699 data: 0.0002 max mem: 26157 Train: [6] [3600/6250] eta: 0:12:10 lr: 0.000125 grad: 0.1195 (0.1792) loss: 0.8706 (0.8763) time: 0.2731 data: 0.0002 max mem: 26157 Train: [6] [3700/6250] eta: 0:11:42 lr: 0.000125 grad: 0.1406 (0.1782) loss: 0.8635 (0.8761) time: 0.2695 data: 0.0002 max mem: 26157 Train: [6] [3800/6250] eta: 0:11:15 lr: 0.000125 grad: 0.1700 (0.1776) loss: 0.8648 (0.8759) time: 0.2728 data: 0.0002 max mem: 26157 Train: [6] [3900/6250] eta: 0:10:47 lr: 0.000125 grad: 0.1372 (0.1767) loss: 0.8684 (0.8758) time: 0.2695 data: 0.0002 max mem: 26157 Train: [6] [4000/6250] eta: 0:10:19 lr: 0.000125 grad: 0.1435 (0.1761) loss: 0.8699 (0.8756) time: 0.2700 data: 0.0002 max mem: 26157 Train: [6] [4100/6250] eta: 0:09:51 lr: 0.000125 grad: 0.1460 (0.1754) loss: 0.8696 (0.8755) time: 0.2691 data: 0.0002 max mem: 26157 Train: [6] [4200/6250] eta: 0:09:23 lr: 0.000125 grad: 0.1522 (0.1747) loss: 0.8713 (0.8753) time: 0.2691 data: 0.0002 max mem: 26157 Train: [6] [4300/6250] eta: 0:08:56 lr: 0.000125 grad: 0.1248 (0.1738) loss: 0.8676 (0.8751) time: 0.2703 data: 0.0002 max mem: 26157 Train: [6] [4400/6250] eta: 0:08:28 lr: 0.000125 grad: 0.1215 (0.1730) loss: 0.8715 (0.8750) time: 0.2728 data: 0.0002 max mem: 26157 Train: [6] [4500/6250] eta: 0:08:00 lr: 0.000125 grad: 0.1374 (0.1723) loss: 0.8657 (0.8748) time: 0.2682 data: 0.0002 max mem: 26157 Train: [6] [4600/6250] eta: 0:07:33 lr: 0.000125 grad: 0.1199 (0.1717) loss: 0.8633 (0.8746) time: 0.2716 data: 0.0002 max mem: 26157 Train: [6] [4700/6250] eta: 0:07:05 lr: 0.000125 grad: 0.1373 (0.1712) loss: 0.8666 (0.8745) time: 0.2699 data: 0.0001 max mem: 26157 Train: [6] [4800/6250] eta: 0:06:40 lr: 0.000125 grad: 0.1600 (0.1708) loss: 0.8684 (0.8743) time: 0.2717 data: 0.0002 max mem: 26157 Train: [6] [4900/6250] eta: 0:06:12 lr: 0.000125 grad: 0.1405 (0.1702) loss: 0.8660 (0.8741) time: 0.2704 data: 0.0002 max mem: 26157 Train: [6] [5000/6250] eta: 0:05:44 lr: 0.000125 grad: 0.1232 (0.1700) loss: 0.8624 (0.8739) time: 0.2726 data: 0.0002 max mem: 26157 Train: [6] [5100/6250] eta: 0:05:17 lr: 0.000125 grad: 0.1404 (0.1694) loss: 0.8609 (0.8737) time: 0.2692 data: 0.0002 max mem: 26157 Train: [6] [5200/6250] eta: 0:04:49 lr: 0.000125 grad: 0.1359 (0.1692) loss: 0.8625 (0.8734) time: 0.2703 data: 0.0002 max mem: 26157 Train: [6] [5300/6250] eta: 0:04:22 lr: 0.000125 grad: 0.1545 (0.1685) loss: 0.8631 (0.8732) time: 0.2722 data: 0.0002 max mem: 26157 Train: [6] [5400/6250] eta: 0:03:54 lr: 0.000125 grad: 0.1291 (0.1679) loss: 0.8611 (0.8730) time: 0.2710 data: 0.0002 max mem: 26157 Train: [6] [5500/6250] eta: 0:03:27 lr: 0.000125 grad: 0.1428 (0.1673) loss: 0.8591 (0.8728) time: 0.2716 data: 0.0002 max mem: 26157 Train: [6] [5600/6250] eta: 0:02:59 lr: 0.000125 grad: 0.1391 (0.1667) loss: 0.8606 (0.8726) time: 0.2697 data: 0.0002 max mem: 26157 Train: [6] [5700/6250] eta: 0:02:31 lr: 0.000125 grad: 0.1114 (0.1660) loss: 0.8625 (0.8725) time: 0.2697 data: 0.0002 max mem: 26157 Train: [6] [5800/6250] eta: 0:02:04 lr: 0.000125 grad: 0.1206 (0.1652) loss: 0.8641 (0.8723) time: 0.2679 data: 0.0002 max mem: 26157 Train: [6] [5900/6250] eta: 0:01:36 lr: 0.000125 grad: 0.1250 (0.1647) loss: 0.8614 (0.8721) time: 0.2689 data: 0.0002 max mem: 26157 Train: [6] [6000/6250] eta: 0:01:08 lr: 0.000125 grad: 0.1294 (0.1640) loss: 0.8616 (0.8720) time: 0.2693 data: 0.0002 max mem: 26157 Train: [6] [6100/6250] eta: 0:00:41 lr: 0.000125 grad: 0.1430 (0.1638) loss: 0.8601 (0.8718) time: 0.2713 data: 0.0002 max mem: 26157 Train: [6] [6200/6250] eta: 0:00:13 lr: 0.000125 grad: 0.1116 (0.1631) loss: 0.8605 (0.8717) time: 0.2696 data: 0.0002 max mem: 26157 Train: [6] [6249/6250] eta: 0:00:00 lr: 0.000125 grad: 0.1026 (0.1627) loss: 0.8609 (0.8716) time: 0.2685 data: 0.0002 max mem: 26157 Train: [6] Total time: 0:28:48 (0.2766 s / it) Averaged stats: lr: 0.000125 grad: 0.1026 (0.1627) loss: 0.8609 (0.8716) Eval (hcp-train-subset): [6] [ 0/62] eta: 0:04:25 loss: 0.8743 (0.8743) time: 4.2889 data: 4.2045 max mem: 26157 Eval (hcp-train-subset): [6] [61/62] eta: 0:00:00 loss: 0.8675 (0.8668) time: 0.1291 data: 0.0459 max mem: 26157 Eval (hcp-train-subset): [6] Total time: 0:00:12 (0.2081 s / it) Averaged stats (hcp-train-subset): loss: 0.8675 (0.8668) Making plots (hcp-train-subset): example=18 Eval (hcp-val): [6] [ 0/62] eta: 0:05:15 loss: 0.8591 (0.8591) time: 5.0889 data: 5.0047 max mem: 26157 Eval (hcp-val): [6] [61/62] eta: 0:00:00 loss: 0.8605 (0.8614) time: 0.1295 data: 0.0450 max mem: 26157 Eval (hcp-val): [6] Total time: 0:00:12 (0.2090 s / it) Averaged stats (hcp-val): loss: 0.8605 (0.8614) Making plots (hcp-val): example=58 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [7] [ 0/6250] eta: 10:44:37 lr: 0.000125 grad: 0.3638 (0.3638) loss: 0.8120 (0.8120) time: 6.1883 data: 5.8908 max mem: 26157 Train: [7] [ 100/6250] eta: 0:33:53 lr: 0.000125 grad: 0.1102 (0.1370) loss: 0.8551 (0.8560) time: 0.2747 data: 0.0002 max mem: 26157 Train: [7] [ 200/6250] eta: 0:30:18 lr: 0.000125 grad: 0.1280 (0.1433) loss: 0.8558 (0.8550) time: 0.2703 data: 0.0002 max mem: 26157 Train: [7] [ 300/6250] eta: 0:28:48 lr: 0.000125 grad: 0.1206 (0.1455) loss: 0.8558 (0.8547) time: 0.2709 data: 0.0003 max mem: 26157 Train: [7] [ 400/6250] eta: 0:27:49 lr: 0.000125 grad: 0.1324 (0.1441) loss: 0.8557 (0.8548) time: 0.2729 data: 0.0002 max mem: 26157 Train: [7] [ 500/6250] eta: 0:27:02 lr: 0.000125 grad: 0.1433 (0.1429) loss: 0.8623 (0.8552) time: 0.2672 data: 0.0002 max mem: 26157 Train: [7] [ 600/6250] eta: 0:26:21 lr: 0.000125 grad: 0.1215 (0.1398) loss: 0.8597 (0.8557) time: 0.2678 data: 0.0002 max mem: 26157 Train: [7] [ 700/6250] eta: 0:25:44 lr: 0.000125 grad: 0.1493 (0.1404) loss: 0.8583 (0.8561) time: 0.2702 data: 0.0002 max mem: 26157 Train: [7] [ 800/6250] eta: 0:25:10 lr: 0.000125 grad: 0.1054 (0.1389) loss: 0.8586 (0.8563) time: 0.2680 data: 0.0001 max mem: 26157 Train: [7] [ 900/6250] eta: 0:24:37 lr: 0.000125 grad: 0.1162 (0.1379) loss: 0.8565 (0.8564) time: 0.2679 data: 0.0002 max mem: 26157 Train: [7] [1000/6250] eta: 0:24:06 lr: 0.000125 grad: 0.1190 (0.1370) loss: 0.8562 (0.8565) time: 0.2684 data: 0.0002 max mem: 26157 Train: [7] [1100/6250] eta: 0:23:35 lr: 0.000125 grad: 0.1218 (0.1356) loss: 0.8572 (0.8565) time: 0.2681 data: 0.0002 max mem: 26157 Train: [7] [1200/6250] eta: 0:23:05 lr: 0.000125 grad: 0.1023 (0.1344) loss: 0.8611 (0.8566) time: 0.2685 data: 0.0002 max mem: 26157 Train: [7] [1300/6250] eta: 0:22:35 lr: 0.000125 grad: 0.1130 (0.1340) loss: 0.8587 (0.8566) time: 0.2690 data: 0.0002 max mem: 26157 Train: [7] [1400/6250] eta: 0:22:06 lr: 0.000125 grad: 0.1160 (0.1338) loss: 0.8560 (0.8567) time: 0.2700 data: 0.0002 max mem: 26157 Train: [7] [1500/6250] eta: 0:21:38 lr: 0.000125 grad: 0.1232 (0.1332) loss: 0.8586 (0.8567) time: 0.2680 data: 0.0002 max mem: 26157 Train: [7] [1600/6250] eta: 0:21:09 lr: 0.000125 grad: 0.1150 (0.1330) loss: 0.8547 (0.8567) time: 0.2710 data: 0.0002 max mem: 26157 Train: [7] [1700/6250] eta: 0:20:41 lr: 0.000125 grad: 0.1112 (0.1322) loss: 0.8547 (0.8566) time: 0.2702 data: 0.0002 max mem: 26157 Train: [7] [1800/6250] eta: 0:20:13 lr: 0.000125 grad: 0.1208 (0.1315) loss: 0.8552 (0.8566) time: 0.2697 data: 0.0002 max mem: 26157 Train: [7] [1900/6250] eta: 0:19:45 lr: 0.000125 grad: 0.1094 (0.1312) loss: 0.8559 (0.8567) time: 0.2705 data: 0.0011 max mem: 26157 Train: [7] [2000/6250] eta: 0:19:18 lr: 0.000125 grad: 0.1353 (0.1311) loss: 0.8546 (0.8566) time: 0.2711 data: 0.0002 max mem: 26157 Train: [7] [2100/6250] eta: 0:18:50 lr: 0.000125 grad: 0.1104 (0.1308) loss: 0.8520 (0.8565) time: 0.2776 data: 0.0002 max mem: 26157 Train: [7] [2200/6250] eta: 0:18:23 lr: 0.000125 grad: 0.1176 (0.1306) loss: 0.8588 (0.8564) time: 0.2845 data: 0.0003 max mem: 26157 Train: [7] [2300/6250] eta: 0:18:02 lr: 0.000125 grad: 0.1348 (0.1305) loss: 0.8532 (0.8564) time: 0.2690 data: 0.0002 max mem: 26157 Train: [7] [2400/6250] eta: 0:17:34 lr: 0.000125 grad: 0.1156 (0.1304) loss: 0.8543 (0.8563) time: 0.2687 data: 0.0002 max mem: 26157 Train: [7] [2500/6250] eta: 0:17:06 lr: 0.000125 grad: 0.1248 (0.1299) loss: 0.8550 (0.8562) time: 0.2702 data: 0.0002 max mem: 26157 Train: [7] [2600/6250] eta: 0:16:38 lr: 0.000125 grad: 0.1130 (0.1293) loss: 0.8581 (0.8563) time: 0.2687 data: 0.0001 max mem: 26157 Train: [7] [2700/6250] eta: 0:16:10 lr: 0.000125 grad: 0.1117 (0.1288) loss: 0.8534 (0.8562) time: 0.2686 data: 0.0001 max mem: 26157 Train: [7] [2800/6250] eta: 0:15:42 lr: 0.000125 grad: 0.1058 (0.1285) loss: 0.8550 (0.8562) time: 0.2688 data: 0.0002 max mem: 26157 Train: [7] [2900/6250] eta: 0:15:14 lr: 0.000125 grad: 0.1163 (0.1282) loss: 0.8584 (0.8562) time: 0.2692 data: 0.0002 max mem: 26157 Train: [7] [3000/6250] eta: 0:14:46 lr: 0.000125 grad: 0.1041 (0.1278) loss: 0.8569 (0.8562) time: 0.2686 data: 0.0002 max mem: 26157 Train: [7] [3100/6250] eta: 0:14:19 lr: 0.000125 grad: 0.0986 (0.1272) loss: 0.8558 (0.8562) time: 0.2685 data: 0.0002 max mem: 26157 Train: [7] [3200/6250] eta: 0:13:51 lr: 0.000125 grad: 0.1154 (0.1268) loss: 0.8559 (0.8562) time: 0.2691 data: 0.0002 max mem: 26157 Train: [7] [3300/6250] eta: 0:13:23 lr: 0.000125 grad: 0.0989 (0.1264) loss: 0.8579 (0.8562) time: 0.2684 data: 0.0002 max mem: 26157 Train: [7] [3400/6250] eta: 0:12:56 lr: 0.000125 grad: 0.0955 (0.1259) loss: 0.8525 (0.8562) time: 0.2691 data: 0.0002 max mem: 26157 Train: [7] [3500/6250] eta: 0:12:29 lr: 0.000125 grad: 0.0935 (0.1256) loss: 0.8559 (0.8562) time: 0.3128 data: 0.0002 max mem: 26157 Train: [7] [3600/6250] eta: 0:12:01 lr: 0.000125 grad: 0.1058 (0.1250) loss: 0.8543 (0.8562) time: 0.2677 data: 0.0002 max mem: 26157 Train: [7] [3700/6250] eta: 0:11:34 lr: 0.000125 grad: 0.1133 (0.1246) loss: 0.8590 (0.8562) time: 0.2689 data: 0.0002 max mem: 26157 Train: [7] [3800/6250] eta: 0:11:06 lr: 0.000125 grad: 0.1027 (0.1242) loss: 0.8566 (0.8562) time: 0.2680 data: 0.0002 max mem: 26157 Train: [7] [3900/6250] eta: 0:10:39 lr: 0.000125 grad: 0.1058 (0.1239) loss: 0.8563 (0.8562) time: 0.2683 data: 0.0002 max mem: 26157 Train: [7] [4000/6250] eta: 0:10:12 lr: 0.000125 grad: 0.0948 (0.1234) loss: 0.8572 (0.8562) time: 0.2674 data: 0.0001 max mem: 26157 Train: [7] [4100/6250] eta: 0:09:44 lr: 0.000125 grad: 0.0957 (0.1229) loss: 0.8594 (0.8562) time: 0.2695 data: 0.0002 max mem: 26157 Train: [7] [4200/6250] eta: 0:09:17 lr: 0.000125 grad: 0.1097 (0.1226) loss: 0.8556 (0.8561) time: 0.2695 data: 0.0002 max mem: 26157 Train: [7] [4300/6250] eta: 0:08:50 lr: 0.000125 grad: 0.0947 (0.1222) loss: 0.8534 (0.8561) time: 0.2732 data: 0.0002 max mem: 26157 Train: [7] [4400/6250] eta: 0:08:22 lr: 0.000125 grad: 0.0963 (0.1218) loss: 0.8576 (0.8561) time: 0.2687 data: 0.0002 max mem: 26157 Train: [7] [4500/6250] eta: 0:07:55 lr: 0.000125 grad: 0.1126 (0.1216) loss: 0.8562 (0.8561) time: 0.2696 data: 0.0002 max mem: 26157 Train: [7] [4600/6250] eta: 0:07:28 lr: 0.000125 grad: 0.0931 (0.1214) loss: 0.8571 (0.8561) time: 0.2769 data: 0.0003 max mem: 26157 Train: [7] [4700/6250] eta: 0:07:01 lr: 0.000125 grad: 0.1003 (0.1210) loss: 0.8565 (0.8561) time: 0.2704 data: 0.0002 max mem: 26157 Train: [7] [4800/6250] eta: 0:06:34 lr: 0.000125 grad: 0.1087 (0.1207) loss: 0.8531 (0.8562) time: 0.2723 data: 0.0002 max mem: 26157 Train: [7] [4900/6250] eta: 0:06:06 lr: 0.000125 grad: 0.1041 (0.1204) loss: 0.8578 (0.8561) time: 0.2749 data: 0.0002 max mem: 26157 Train: [7] [5000/6250] eta: 0:05:39 lr: 0.000125 grad: 0.0958 (0.1200) loss: 0.8506 (0.8561) time: 0.2693 data: 0.0002 max mem: 26157 Train: [7] [5100/6250] eta: 0:05:12 lr: 0.000125 grad: 0.0881 (0.1196) loss: 0.8564 (0.8561) time: 0.2737 data: 0.0002 max mem: 26157 Train: [7] [5200/6250] eta: 0:04:45 lr: 0.000125 grad: 0.0995 (0.1193) loss: 0.8541 (0.8560) time: 0.2679 data: 0.0002 max mem: 26157 Train: [7] [5300/6250] eta: 0:04:18 lr: 0.000125 grad: 0.0876 (0.1189) loss: 0.8542 (0.8560) time: 0.2675 data: 0.0001 max mem: 26157 Train: [7] [5400/6250] eta: 0:03:50 lr: 0.000125 grad: 0.0973 (0.1186) loss: 0.8571 (0.8560) time: 0.2696 data: 0.0002 max mem: 26157 Train: [7] [5500/6250] eta: 0:03:23 lr: 0.000125 grad: 0.0934 (0.1183) loss: 0.8558 (0.8560) time: 0.2706 data: 0.0002 max mem: 26157 Train: [7] [5600/6250] eta: 0:02:56 lr: 0.000125 grad: 0.1005 (0.1181) loss: 0.8524 (0.8560) time: 0.2718 data: 0.0002 max mem: 26157 Train: [7] [5700/6250] eta: 0:02:29 lr: 0.000125 grad: 0.0869 (0.1177) loss: 0.8567 (0.8559) time: 0.2687 data: 0.0002 max mem: 26157 Train: [7] [5800/6250] eta: 0:02:02 lr: 0.000125 grad: 0.0968 (0.1176) loss: 0.8556 (0.8559) time: 0.2672 data: 0.0002 max mem: 26157 Train: [7] [5900/6250] eta: 0:01:35 lr: 0.000125 grad: 0.0889 (0.1174) loss: 0.8538 (0.8559) time: 0.2676 data: 0.0001 max mem: 26157 Train: [7] [6000/6250] eta: 0:01:07 lr: 0.000125 grad: 0.0953 (0.1173) loss: 0.8537 (0.8558) time: 0.2676 data: 0.0002 max mem: 26157 Train: [7] [6100/6250] eta: 0:00:40 lr: 0.000125 grad: 0.0933 (0.1170) loss: 0.8540 (0.8558) time: 0.2683 data: 0.0002 max mem: 26157 Train: [7] [6200/6250] eta: 0:00:13 lr: 0.000125 grad: 0.0834 (0.1166) loss: 0.8547 (0.8558) time: 0.2699 data: 0.0002 max mem: 26157 Train: [7] [6249/6250] eta: 0:00:00 lr: 0.000125 grad: 0.1022 (0.1165) loss: 0.8546 (0.8558) time: 0.2697 data: 0.0002 max mem: 26157 Train: [7] Total time: 0:28:23 (0.2725 s / it) Averaged stats: lr: 0.000125 grad: 0.1022 (0.1165) loss: 0.8546 (0.8558) Eval (hcp-train-subset): [7] [ 0/62] eta: 0:02:44 loss: 0.8668 (0.8668) time: 2.6568 data: 2.5544 max mem: 26157 Eval (hcp-train-subset): [7] [61/62] eta: 0:00:00 loss: 0.8580 (0.8589) time: 0.1385 data: 0.0559 max mem: 26157 Eval (hcp-train-subset): [7] Total time: 0:00:11 (0.1931 s / it) Averaged stats (hcp-train-subset): loss: 0.8580 (0.8589) Making plots (hcp-train-subset): example=25 Eval (hcp-val): [7] [ 0/62] eta: 0:03:37 loss: 0.8464 (0.8464) time: 3.5049 data: 3.3752 max mem: 26157 Eval (hcp-val): [7] [61/62] eta: 0:00:00 loss: 0.8523 (0.8533) time: 0.1248 data: 0.0405 max mem: 26157 Eval (hcp-val): [7] Total time: 0:00:11 (0.1911 s / it) Averaged stats (hcp-val): loss: 0.8523 (0.8533) Making plots (hcp-val): example=50 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [8] [ 0/6250] eta: 9:47:03 lr: 0.000125 grad: 0.1016 (0.1016) loss: 0.8817 (0.8817) time: 5.6358 data: 5.3595 max mem: 26157 Train: [8] [ 100/6250] eta: 0:32:57 lr: 0.000125 grad: 0.1063 (0.1091) loss: 0.8495 (0.8569) time: 0.2676 data: 0.0002 max mem: 26157 Train: [8] [ 200/6250] eta: 0:29:50 lr: 0.000125 grad: 0.0918 (0.1074) loss: 0.8511 (0.8543) time: 0.2705 data: 0.0002 max mem: 26157 Train: [8] [ 300/6250] eta: 0:28:30 lr: 0.000125 grad: 0.0973 (0.1056) loss: 0.8503 (0.8530) time: 0.2711 data: 0.0002 max mem: 26157 Train: [8] [ 400/6250] eta: 0:27:35 lr: 0.000125 grad: 0.0969 (0.1044) loss: 0.8468 (0.8525) time: 0.2694 data: 0.0002 max mem: 26157 Train: [8] [ 500/6250] eta: 0:26:51 lr: 0.000125 grad: 0.0994 (0.1052) loss: 0.8421 (0.8516) time: 0.2697 data: 0.0002 max mem: 26157 Train: [8] [ 600/6250] eta: 0:27:04 lr: 0.000125 grad: 0.0863 (0.1057) loss: 0.8443 (0.8512) time: 0.2687 data: 0.0002 max mem: 26157 Train: [8] [ 700/6250] eta: 0:26:21 lr: 0.000125 grad: 0.0930 (0.1041) loss: 0.8526 (0.8509) time: 0.2702 data: 0.0003 max mem: 26157 Train: [8] [ 800/6250] eta: 0:25:42 lr: 0.000125 grad: 0.0937 (0.1030) loss: 0.8497 (0.8509) time: 0.2684 data: 0.0002 max mem: 26157 Train: [8] [ 900/6250] eta: 0:25:05 lr: 0.000125 grad: 0.0968 (0.1019) loss: 0.8533 (0.8511) time: 0.2697 data: 0.0002 max mem: 26157 Train: [8] [1000/6250] eta: 0:24:30 lr: 0.000125 grad: 0.0889 (0.1020) loss: 0.8548 (0.8514) time: 0.2684 data: 0.0002 max mem: 26157 Train: [8] [1100/6250] eta: 0:23:57 lr: 0.000125 grad: 0.0904 (0.1013) loss: 0.8524 (0.8516) time: 0.2685 data: 0.0002 max mem: 26157 Train: [8] [1200/6250] eta: 0:23:24 lr: 0.000125 grad: 0.0887 (0.1011) loss: 0.8483 (0.8516) time: 0.2687 data: 0.0002 max mem: 26157 Train: [8] [1300/6250] eta: 0:22:53 lr: 0.000125 grad: 0.0855 (0.1006) loss: 0.8529 (0.8517) time: 0.2704 data: 0.0002 max mem: 26157 Train: [8] [1400/6250] eta: 0:22:23 lr: 0.000125 grad: 0.0909 (0.1006) loss: 0.8511 (0.8519) time: 0.2690 data: 0.0002 max mem: 26157 Train: [8] [1500/6250] eta: 0:21:53 lr: 0.000125 grad: 0.0884 (0.0999) loss: 0.8522 (0.8519) time: 0.2698 data: 0.0002 max mem: 26157 Train: [8] [1600/6250] eta: 0:21:23 lr: 0.000125 grad: 0.1031 (0.0997) loss: 0.8530 (0.8520) time: 0.2689 data: 0.0002 max mem: 26157 Train: [8] [1700/6250] eta: 0:20:54 lr: 0.000125 grad: 0.0983 (0.0994) loss: 0.8526 (0.8521) time: 0.2697 data: 0.0002 max mem: 26157 Train: [8] [1800/6250] eta: 0:20:26 lr: 0.000125 grad: 0.0968 (0.0997) loss: 0.8494 (0.8522) time: 0.2690 data: 0.0002 max mem: 26157 Train: [8] [1900/6250] eta: 0:19:57 lr: 0.000125 grad: 0.0943 (0.0994) loss: 0.8548 (0.8523) time: 0.2714 data: 0.0002 max mem: 26157 Train: [8] [2000/6250] eta: 0:19:31 lr: 0.000125 grad: 0.0885 (0.0991) loss: 0.8538 (0.8523) time: 0.3233 data: 0.0520 max mem: 26157 Train: [8] [2100/6250] eta: 0:19:03 lr: 0.000125 grad: 0.0811 (0.0989) loss: 0.8495 (0.8521) time: 0.2681 data: 0.0002 max mem: 26157 Train: [8] [2200/6250] eta: 0:18:34 lr: 0.000125 grad: 0.0939 (0.0987) loss: 0.8522 (0.8521) time: 0.2682 data: 0.0002 max mem: 26157 Train: [8] [2300/6250] eta: 0:18:06 lr: 0.000125 grad: 0.0955 (0.0986) loss: 0.8487 (0.8520) time: 0.2678 data: 0.0002 max mem: 26157 Train: [8] [2400/6250] eta: 0:17:37 lr: 0.000125 grad: 0.0910 (0.0985) loss: 0.8509 (0.8521) time: 0.2693 data: 0.0002 max mem: 26157 Train: [8] [2500/6250] eta: 0:17:09 lr: 0.000125 grad: 0.0917 (0.0982) loss: 0.8531 (0.8521) time: 0.2679 data: 0.0002 max mem: 26157 Train: [8] [2600/6250] eta: 0:16:41 lr: 0.000125 grad: 0.0847 (0.0979) loss: 0.8496 (0.8522) time: 0.2723 data: 0.0002 max mem: 26157 Train: [8] [2700/6250] eta: 0:16:13 lr: 0.000125 grad: 0.0891 (0.0979) loss: 0.8533 (0.8522) time: 0.2696 data: 0.0002 max mem: 26157 Train: [8] [2800/6250] eta: 0:15:45 lr: 0.000125 grad: 0.0942 (0.0978) loss: 0.8516 (0.8522) time: 0.2690 data: 0.0002 max mem: 26157 Train: [8] [2900/6250] eta: 0:15:17 lr: 0.000125 grad: 0.0865 (0.0976) loss: 0.8564 (0.8522) time: 0.2696 data: 0.0002 max mem: 26157 Train: [8] [3000/6250] eta: 0:14:49 lr: 0.000125 grad: 0.0785 (0.0973) loss: 0.8560 (0.8522) time: 0.2687 data: 0.0002 max mem: 26157 Train: [8] [3100/6250] eta: 0:14:21 lr: 0.000125 grad: 0.0966 (0.0972) loss: 0.8536 (0.8522) time: 0.2698 data: 0.0002 max mem: 26157 Train: [8] [3200/6250] eta: 0:13:54 lr: 0.000125 grad: 0.0877 (0.0970) loss: 0.8562 (0.8521) time: 0.2730 data: 0.0002 max mem: 26157 Train: [8] [3300/6250] eta: 0:13:26 lr: 0.000125 grad: 0.0907 (0.0968) loss: 0.8539 (0.8521) time: 0.2685 data: 0.0002 max mem: 26157 Train: [8] [3400/6250] eta: 0:12:58 lr: 0.000125 grad: 0.0876 (0.0968) loss: 0.8506 (0.8521) time: 0.2710 data: 0.0002 max mem: 26157 Train: [8] [3500/6250] eta: 0:12:31 lr: 0.000125 grad: 0.0831 (0.0965) loss: 0.8569 (0.8521) time: 0.2670 data: 0.0002 max mem: 26157 Train: [8] [3600/6250] eta: 0:12:05 lr: 0.000125 grad: 0.0848 (0.0963) loss: 0.8503 (0.8521) time: 0.2705 data: 0.0002 max mem: 26157 Train: [8] [3700/6250] eta: 0:11:37 lr: 0.000125 grad: 0.0860 (0.0961) loss: 0.8522 (0.8521) time: 0.2677 data: 0.0002 max mem: 26157 Train: [8] [3800/6250] eta: 0:11:09 lr: 0.000125 grad: 0.0812 (0.0958) loss: 0.8552 (0.8522) time: 0.2689 data: 0.0002 max mem: 26157 Train: [8] [3900/6250] eta: 0:10:42 lr: 0.000125 grad: 0.0837 (0.0957) loss: 0.8538 (0.8521) time: 0.2694 data: 0.0002 max mem: 26157 Train: [8] [4000/6250] eta: 0:10:14 lr: 0.000125 grad: 0.0909 (0.0955) loss: 0.8517 (0.8521) time: 0.2700 data: 0.0002 max mem: 26157 Train: [8] [4100/6250] eta: 0:09:47 lr: 0.000125 grad: 0.0815 (0.0953) loss: 0.8528 (0.8521) time: 0.2683 data: 0.0001 max mem: 26157 Train: [8] [4200/6250] eta: 0:09:19 lr: 0.000125 grad: 0.0815 (0.0951) loss: 0.8515 (0.8521) time: 0.2695 data: 0.0002 max mem: 26157 Train: [8] [4300/6250] eta: 0:08:52 lr: 0.000125 grad: 0.0796 (0.0949) loss: 0.8568 (0.8521) time: 0.2683 data: 0.0002 max mem: 26157 Train: [8] [4400/6250] eta: 0:08:24 lr: 0.000125 grad: 0.0853 (0.0947) loss: 0.8516 (0.8521) time: 0.2702 data: 0.0002 max mem: 26157 Train: [8] [4500/6250] eta: 0:07:57 lr: 0.000125 grad: 0.0892 (0.0946) loss: 0.8523 (0.8521) time: 0.2697 data: 0.0002 max mem: 26157 Train: [8] [4600/6250] eta: 0:07:30 lr: 0.000125 grad: 0.0772 (0.0944) loss: 0.8502 (0.8521) time: 0.2696 data: 0.0002 max mem: 26157 Train: [8] [4700/6250] eta: 0:07:02 lr: 0.000125 grad: 0.0814 (0.0943) loss: 0.8498 (0.8521) time: 0.2683 data: 0.0002 max mem: 26157 Train: [8] [4800/6250] eta: 0:06:35 lr: 0.000125 grad: 0.0889 (0.0942) loss: 0.8481 (0.8521) time: 0.2681 data: 0.0002 max mem: 26157 Train: [8] [4900/6250] eta: 0:06:07 lr: 0.000125 grad: 0.0783 (0.0940) loss: 0.8513 (0.8521) time: 0.2679 data: 0.0001 max mem: 26157 Train: [8] [5000/6250] eta: 0:05:40 lr: 0.000125 grad: 0.0850 (0.0939) loss: 0.8500 (0.8521) time: 0.2694 data: 0.0002 max mem: 26157 Train: [8] [5100/6250] eta: 0:05:13 lr: 0.000125 grad: 0.0883 (0.0940) loss: 0.8537 (0.8520) time: 0.2680 data: 0.0002 max mem: 26157 Train: [8] [5200/6250] eta: 0:04:45 lr: 0.000124 grad: 0.0835 (0.0938) loss: 0.8455 (0.8520) time: 0.2699 data: 0.0002 max mem: 26157 Train: [8] [5300/6250] eta: 0:04:18 lr: 0.000124 grad: 0.0911 (0.0936) loss: 0.8488 (0.8520) time: 0.2685 data: 0.0002 max mem: 26157 Train: [8] [5400/6250] eta: 0:03:51 lr: 0.000124 grad: 0.0891 (0.0936) loss: 0.8511 (0.8520) time: 0.2708 data: 0.0002 max mem: 26157 Train: [8] [5500/6250] eta: 0:03:24 lr: 0.000124 grad: 0.0826 (0.0935) loss: 0.8474 (0.8519) time: 0.2695 data: 0.0002 max mem: 26157 Train: [8] [5600/6250] eta: 0:02:56 lr: 0.000124 grad: 0.0823 (0.0934) loss: 0.8501 (0.8519) time: 0.2730 data: 0.0002 max mem: 26157 Train: [8] [5700/6250] eta: 0:02:29 lr: 0.000124 grad: 0.0837 (0.0932) loss: 0.8482 (0.8518) time: 0.2694 data: 0.0002 max mem: 26157 Train: [8] [5800/6250] eta: 0:02:02 lr: 0.000124 grad: 0.0812 (0.0931) loss: 0.8476 (0.8518) time: 0.2702 data: 0.0002 max mem: 26157 Train: [8] [5900/6250] eta: 0:01:35 lr: 0.000124 grad: 0.0878 (0.0930) loss: 0.8524 (0.8518) time: 0.2685 data: 0.0002 max mem: 26157 Train: [8] [6000/6250] eta: 0:01:08 lr: 0.000124 grad: 0.0788 (0.0928) loss: 0.8505 (0.8518) time: 0.2690 data: 0.0002 max mem: 26157 Train: [8] [6100/6250] eta: 0:00:40 lr: 0.000124 grad: 0.0844 (0.0927) loss: 0.8497 (0.8517) time: 0.2708 data: 0.0002 max mem: 26157 Train: [8] [6200/6250] eta: 0:00:13 lr: 0.000124 grad: 0.0781 (0.0926) loss: 0.8506 (0.8517) time: 0.2696 data: 0.0002 max mem: 26157 Train: [8] [6249/6250] eta: 0:00:00 lr: 0.000124 grad: 0.0810 (0.0925) loss: 0.8461 (0.8516) time: 0.2705 data: 0.0002 max mem: 26157 Train: [8] Total time: 0:28:26 (0.2730 s / it) Averaged stats: lr: 0.000124 grad: 0.0810 (0.0925) loss: 0.8461 (0.8516) Eval (hcp-train-subset): [8] [ 0/62] eta: 0:03:58 loss: 0.8651 (0.8651) time: 3.8543 data: 3.7250 max mem: 26157 Eval (hcp-train-subset): [8] [61/62] eta: 0:00:00 loss: 0.8499 (0.8528) time: 0.1403 data: 0.0576 max mem: 26157 Eval (hcp-train-subset): [8] Total time: 0:00:13 (0.2171 s / it) Averaged stats (hcp-train-subset): loss: 0.8499 (0.8528) Making plots (hcp-train-subset): example=8 Eval (hcp-val): [8] [ 0/62] eta: 0:04:43 loss: 0.8422 (0.8422) time: 4.5720 data: 4.4882 max mem: 26157 Eval (hcp-val): [8] [61/62] eta: 0:00:00 loss: 0.8459 (0.8472) time: 0.1243 data: 0.0414 max mem: 26157 Eval (hcp-val): [8] Total time: 0:00:12 (0.2011 s / it) Averaged stats (hcp-val): loss: 0.8459 (0.8472) Making plots (hcp-val): example=36 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [9] [ 0/6250] eta: 9:18:35 lr: 0.000124 grad: 0.0533 (0.0533) loss: 0.8792 (0.8792) time: 5.3624 data: 5.0854 max mem: 26157 Train: [9] [ 100/6250] eta: 0:33:54 lr: 0.000124 grad: 0.0910 (0.1011) loss: 0.8466 (0.8490) time: 0.2707 data: 0.0002 max mem: 26157 Train: [9] [ 200/6250] eta: 0:30:26 lr: 0.000124 grad: 0.0852 (0.0963) loss: 0.8493 (0.8492) time: 0.2735 data: 0.0002 max mem: 26157 Train: [9] [ 300/6250] eta: 0:28:59 lr: 0.000124 grad: 0.0769 (0.0945) loss: 0.8445 (0.8489) time: 0.2765 data: 0.0002 max mem: 26157 Train: [9] [ 400/6250] eta: 0:28:00 lr: 0.000124 grad: 0.0858 (0.0940) loss: 0.8493 (0.8486) time: 0.2726 data: 0.0002 max mem: 26157 Train: [9] [ 500/6250] eta: 0:27:14 lr: 0.000124 grad: 0.0905 (0.0934) loss: 0.8407 (0.8481) time: 0.2708 data: 0.0002 max mem: 26157 Train: [9] [ 600/6250] eta: 0:27:17 lr: 0.000124 grad: 0.0810 (0.0925) loss: 0.8456 (0.8481) time: 0.2760 data: 0.0002 max mem: 26157 Train: [9] [ 700/6250] eta: 0:27:12 lr: 0.000124 grad: 0.0900 (0.0920) loss: 0.8449 (0.8478) time: 0.2747 data: 0.0002 max mem: 26157 Train: [9] [ 800/6250] eta: 0:26:28 lr: 0.000124 grad: 0.0826 (0.0916) loss: 0.8431 (0.8475) time: 0.2683 data: 0.0002 max mem: 26157 Train: [9] [ 900/6250] eta: 0:25:50 lr: 0.000124 grad: 0.0830 (0.0910) loss: 0.8460 (0.8475) time: 0.2719 data: 0.0003 max mem: 26157 Train: [9] [1000/6250] eta: 0:25:10 lr: 0.000124 grad: 0.0808 (0.0903) loss: 0.8521 (0.8476) time: 0.2689 data: 0.0002 max mem: 26157 Train: [9] [1100/6250] eta: 0:24:33 lr: 0.000124 grad: 0.0830 (0.0898) loss: 0.8469 (0.8476) time: 0.2690 data: 0.0002 max mem: 26157 Train: [9] [1200/6250] eta: 0:23:57 lr: 0.000124 grad: 0.0803 (0.0893) loss: 0.8511 (0.8475) time: 0.2696 data: 0.0002 max mem: 26157 Train: [9] [1300/6250] eta: 0:23:46 lr: 0.000124 grad: 0.0838 (0.0893) loss: 0.8491 (0.8477) time: 0.5207 data: 0.2471 max mem: 26157 Train: [9] [1400/6250] eta: 0:23:11 lr: 0.000124 grad: 0.0767 (0.0886) loss: 0.8513 (0.8478) time: 0.2719 data: 0.0002 max mem: 26157 Train: [9] [1500/6250] eta: 0:22:37 lr: 0.000124 grad: 0.0817 (0.0885) loss: 0.8423 (0.8478) time: 0.2685 data: 0.0002 max mem: 26157 Train: [9] [1600/6250] eta: 0:22:03 lr: 0.000124 grad: 0.0779 (0.0880) loss: 0.8464 (0.8478) time: 0.2692 data: 0.0002 max mem: 26157 Train: [9] [1700/6250] eta: 0:21:30 lr: 0.000124 grad: 0.0765 (0.0878) loss: 0.8473 (0.8478) time: 0.2687 data: 0.0002 max mem: 26157 Train: [9] [1800/6250] eta: 0:20:59 lr: 0.000124 grad: 0.0792 (0.0876) loss: 0.8474 (0.8479) time: 0.2693 data: 0.0002 max mem: 26157 Train: [9] [1900/6250] eta: 0:20:27 lr: 0.000124 grad: 0.0809 (0.0873) loss: 0.8510 (0.8478) time: 0.2698 data: 0.0002 max mem: 26157 Train: [9] [2000/6250] eta: 0:19:56 lr: 0.000124 grad: 0.0840 (0.0872) loss: 0.8522 (0.8479) time: 0.2699 data: 0.0002 max mem: 26157 Train: [9] [2100/6250] eta: 0:19:26 lr: 0.000124 grad: 0.0888 (0.0870) loss: 0.8470 (0.8479) time: 0.2690 data: 0.0002 max mem: 26157 Train: [9] [2200/6250] eta: 0:18:56 lr: 0.000124 grad: 0.0799 (0.0870) loss: 0.8465 (0.8478) time: 0.2694 data: 0.0002 max mem: 26157 Train: [9] [2300/6250] eta: 0:18:25 lr: 0.000124 grad: 0.0827 (0.0867) loss: 0.8450 (0.8478) time: 0.2699 data: 0.0002 max mem: 26157 Train: [9] [2400/6250] eta: 0:17:56 lr: 0.000124 grad: 0.0861 (0.0866) loss: 0.8443 (0.8477) time: 0.2686 data: 0.0002 max mem: 26157 Train: [9] [2500/6250] eta: 0:17:26 lr: 0.000124 grad: 0.0832 (0.0866) loss: 0.8454 (0.8476) time: 0.2693 data: 0.0002 max mem: 26157 Train: [9] [2600/6250] eta: 0:16:57 lr: 0.000124 grad: 0.0828 (0.0865) loss: 0.8527 (0.8476) time: 0.2680 data: 0.0002 max mem: 26157 Train: [9] [2700/6250] eta: 0:16:28 lr: 0.000124 grad: 0.0805 (0.0865) loss: 0.8483 (0.8475) time: 0.2693 data: 0.0002 max mem: 26157 Train: [9] [2800/6250] eta: 0:15:59 lr: 0.000124 grad: 0.0816 (0.0865) loss: 0.8439 (0.8473) time: 0.2678 data: 0.0002 max mem: 26157 Train: [9] [2900/6250] eta: 0:15:30 lr: 0.000124 grad: 0.0846 (0.0865) loss: 0.8451 (0.8472) time: 0.2684 data: 0.0002 max mem: 26157 Train: [9] [3000/6250] eta: 0:15:01 lr: 0.000124 grad: 0.0926 (0.0865) loss: 0.8399 (0.8470) time: 0.2685 data: 0.0002 max mem: 26157 Train: [9] [3100/6250] eta: 0:14:32 lr: 0.000124 grad: 0.0836 (0.0869) loss: 0.8442 (0.8469) time: 0.2705 data: 0.0002 max mem: 26157 Train: [9] [3200/6250] eta: 0:14:04 lr: 0.000124 grad: 0.0817 (0.0869) loss: 0.8433 (0.8468) time: 0.2692 data: 0.0002 max mem: 26157 Train: [9] [3300/6250] eta: 0:13:36 lr: 0.000124 grad: 0.0910 (0.0869) loss: 0.8429 (0.8467) time: 0.2689 data: 0.0001 max mem: 26157 Train: [9] [3400/6250] eta: 0:13:08 lr: 0.000124 grad: 0.0795 (0.0868) loss: 0.8399 (0.8465) time: 0.2703 data: 0.0002 max mem: 26157 Train: [9] [3500/6250] eta: 0:12:39 lr: 0.000124 grad: 0.0773 (0.0867) loss: 0.8410 (0.8464) time: 0.2682 data: 0.0002 max mem: 26157 Train: [9] [3600/6250] eta: 0:12:11 lr: 0.000124 grad: 0.0810 (0.0866) loss: 0.8464 (0.8464) time: 0.2719 data: 0.0003 max mem: 26157 Train: [9] [3700/6250] eta: 0:11:43 lr: 0.000124 grad: 0.0821 (0.0866) loss: 0.8441 (0.8463) time: 0.2697 data: 0.0002 max mem: 26157 Train: [9] [3800/6250] eta: 0:11:15 lr: 0.000124 grad: 0.0851 (0.0864) loss: 0.8445 (0.8463) time: 0.2695 data: 0.0002 max mem: 26157 Train: [9] [3900/6250] eta: 0:10:48 lr: 0.000124 grad: 0.0820 (0.0863) loss: 0.8470 (0.8463) time: 0.2712 data: 0.0002 max mem: 26157 Train: [9] [4000/6250] eta: 0:10:20 lr: 0.000124 grad: 0.0873 (0.0862) loss: 0.8427 (0.8462) time: 0.2699 data: 0.0002 max mem: 26157 Train: [9] [4100/6250] eta: 0:09:52 lr: 0.000124 grad: 0.0732 (0.0861) loss: 0.8472 (0.8463) time: 0.2702 data: 0.0002 max mem: 26157 Train: [9] [4200/6250] eta: 0:09:24 lr: 0.000124 grad: 0.0738 (0.0860) loss: 0.8489 (0.8463) time: 0.2724 data: 0.0003 max mem: 26157 Train: [9] [4300/6250] eta: 0:08:56 lr: 0.000124 grad: 0.0743 (0.0858) loss: 0.8451 (0.8463) time: 0.2713 data: 0.0002 max mem: 26157 Train: [9] [4400/6250] eta: 0:08:29 lr: 0.000124 grad: 0.0839 (0.0857) loss: 0.8499 (0.8464) time: 0.2702 data: 0.0002 max mem: 26157 Train: [9] [4500/6250] eta: 0:08:01 lr: 0.000124 grad: 0.0756 (0.0856) loss: 0.8449 (0.8464) time: 0.2693 data: 0.0002 max mem: 26157 Train: [9] [4600/6250] eta: 0:07:33 lr: 0.000124 grad: 0.0837 (0.0855) loss: 0.8449 (0.8463) time: 0.2737 data: 0.0002 max mem: 26157 Train: [9] [4700/6250] eta: 0:07:06 lr: 0.000124 grad: 0.0777 (0.0854) loss: 0.8498 (0.8463) time: 0.2722 data: 0.0002 max mem: 26157 Train: [9] [4800/6250] eta: 0:06:38 lr: 0.000124 grad: 0.0804 (0.0853) loss: 0.8449 (0.8463) time: 0.2694 data: 0.0002 max mem: 26157 Train: [9] [4900/6250] eta: 0:06:10 lr: 0.000124 grad: 0.0783 (0.0853) loss: 0.8446 (0.8463) time: 0.2705 data: 0.0002 max mem: 26157 Train: [9] [5000/6250] eta: 0:05:43 lr: 0.000124 grad: 0.0831 (0.0851) loss: 0.8449 (0.8463) time: 0.2721 data: 0.0002 max mem: 26157 Train: [9] [5100/6250] eta: 0:05:15 lr: 0.000124 grad: 0.0730 (0.0850) loss: 0.8468 (0.8463) time: 0.2695 data: 0.0002 max mem: 26157 Train: [9] [5200/6250] eta: 0:04:48 lr: 0.000124 grad: 0.0751 (0.0849) loss: 0.8497 (0.8464) time: 0.2681 data: 0.0002 max mem: 26157 Train: [9] [5300/6250] eta: 0:04:20 lr: 0.000124 grad: 0.0749 (0.0848) loss: 0.8496 (0.8464) time: 0.2687 data: 0.0002 max mem: 26157 Train: [9] [5400/6250] eta: 0:03:53 lr: 0.000124 grad: 0.0767 (0.0846) loss: 0.8477 (0.8465) time: 0.2748 data: 0.0002 max mem: 26157 Train: [9] [5500/6250] eta: 0:03:25 lr: 0.000124 grad: 0.0771 (0.0845) loss: 0.8453 (0.8465) time: 0.2772 data: 0.0003 max mem: 26157 Train: [9] [5600/6250] eta: 0:02:58 lr: 0.000124 grad: 0.0822 (0.0844) loss: 0.8478 (0.8465) time: 0.2688 data: 0.0002 max mem: 26157 Train: [9] [5700/6250] eta: 0:02:31 lr: 0.000124 grad: 0.0759 (0.0843) loss: 0.8441 (0.8465) time: 0.2718 data: 0.0002 max mem: 26157 Train: [9] [5800/6250] eta: 0:02:03 lr: 0.000124 grad: 0.0836 (0.0842) loss: 0.8444 (0.8465) time: 0.2692 data: 0.0002 max mem: 26157 Train: [9] [5900/6250] eta: 0:01:36 lr: 0.000124 grad: 0.0726 (0.0841) loss: 0.8543 (0.8465) time: 0.2706 data: 0.0002 max mem: 26157 Train: [9] [6000/6250] eta: 0:01:08 lr: 0.000124 grad: 0.0784 (0.0840) loss: 0.8402 (0.8465) time: 0.2697 data: 0.0002 max mem: 26157 Train: [9] [6100/6250] eta: 0:00:41 lr: 0.000124 grad: 0.0823 (0.0840) loss: 0.8481 (0.8465) time: 0.2689 data: 0.0002 max mem: 26157 Train: [9] [6200/6250] eta: 0:00:13 lr: 0.000124 grad: 0.0779 (0.0838) loss: 0.8415 (0.8465) time: 0.2700 data: 0.0002 max mem: 26157 Train: [9] [6249/6250] eta: 0:00:00 lr: 0.000124 grad: 0.0783 (0.0838) loss: 0.8489 (0.8465) time: 0.2704 data: 0.0002 max mem: 26157 Train: [9] Total time: 0:28:44 (0.2760 s / it) Averaged stats: lr: 0.000124 grad: 0.0783 (0.0838) loss: 0.8489 (0.8465) Eval (hcp-train-subset): [9] [ 0/62] eta: 0:03:08 loss: 0.8631 (0.8631) time: 3.0364 data: 2.9128 max mem: 26157 Eval (hcp-train-subset): [9] [61/62] eta: 0:00:00 loss: 0.8498 (0.8505) time: 0.1325 data: 0.0481 max mem: 26157 Eval (hcp-train-subset): [9] Total time: 0:00:13 (0.2126 s / it) Averaged stats (hcp-train-subset): loss: 0.8498 (0.8505) Making plots (hcp-train-subset): example=49 Eval (hcp-val): [9] [ 0/62] eta: 0:05:21 loss: 0.8425 (0.8425) time: 5.1816 data: 5.0973 max mem: 26157 Eval (hcp-val): [9] [61/62] eta: 0:00:00 loss: 0.8444 (0.8455) time: 0.1326 data: 0.0478 max mem: 26157 Eval (hcp-val): [9] Total time: 0:00:12 (0.2033 s / it) Averaged stats (hcp-val): loss: 0.8444 (0.8455) Making plots (hcp-val): example=52 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [10] [ 0/6250] eta: 10:59:21 lr: 0.000124 grad: 0.0744 (0.0744) loss: 0.8397 (0.8397) time: 6.3299 data: 6.0546 max mem: 26157 Train: [10] [ 100/6250] eta: 0:33:45 lr: 0.000124 grad: 0.0796 (0.0985) loss: 0.8320 (0.8381) time: 0.2680 data: 0.0002 max mem: 26157 Train: [10] [ 200/6250] eta: 0:30:11 lr: 0.000124 grad: 0.0818 (0.0937) loss: 0.8378 (0.8379) time: 0.2686 data: 0.0002 max mem: 26157 Train: [10] [ 300/6250] eta: 0:28:45 lr: 0.000124 grad: 0.0773 (0.0901) loss: 0.8346 (0.8375) time: 0.2722 data: 0.0002 max mem: 26157 Train: [10] [ 400/6250] eta: 0:27:49 lr: 0.000124 grad: 0.0803 (0.0902) loss: 0.8336 (0.8379) time: 0.2710 data: 0.0002 max mem: 26157 Train: [10] [ 500/6250] eta: 0:27:05 lr: 0.000124 grad: 0.0766 (0.0882) loss: 0.8392 (0.8381) time: 0.2714 data: 0.0002 max mem: 26157 Train: [10] [ 600/6250] eta: 0:26:23 lr: 0.000124 grad: 0.0754 (0.0868) loss: 0.8457 (0.8389) time: 0.2693 data: 0.0002 max mem: 26157 Train: [10] [ 700/6250] eta: 0:25:47 lr: 0.000124 grad: 0.0705 (0.0856) loss: 0.8410 (0.8395) time: 0.2685 data: 0.0002 max mem: 26157 Train: [10] [ 800/6250] eta: 0:25:13 lr: 0.000124 grad: 0.0720 (0.0843) loss: 0.8484 (0.8403) time: 0.2702 data: 0.0002 max mem: 26157 Train: [10] [ 900/6250] eta: 0:24:42 lr: 0.000124 grad: 0.0740 (0.0833) loss: 0.8457 (0.8409) time: 0.2708 data: 0.0002 max mem: 26157 Train: [10] [1000/6250] eta: 0:24:11 lr: 0.000124 grad: 0.0708 (0.0823) loss: 0.8454 (0.8415) time: 0.2685 data: 0.0003 max mem: 26157 Train: [10] [1100/6250] eta: 0:23:40 lr: 0.000124 grad: 0.0739 (0.0817) loss: 0.8432 (0.8418) time: 0.2698 data: 0.0002 max mem: 26157 Train: [10] [1200/6250] eta: 0:23:10 lr: 0.000124 grad: 0.0823 (0.0816) loss: 0.8438 (0.8421) time: 0.2705 data: 0.0002 max mem: 26157 Train: [10] [1300/6250] eta: 0:22:40 lr: 0.000124 grad: 0.0722 (0.0813) loss: 0.8455 (0.8423) time: 0.2691 data: 0.0002 max mem: 26157 Train: [10] [1400/6250] eta: 0:22:10 lr: 0.000124 grad: 0.0775 (0.0810) loss: 0.8449 (0.8425) time: 0.2681 data: 0.0002 max mem: 26157 Train: [10] [1500/6250] eta: 0:21:41 lr: 0.000124 grad: 0.0735 (0.0807) loss: 0.8452 (0.8426) time: 0.2699 data: 0.0002 max mem: 26157 Train: [10] [1600/6250] eta: 0:21:13 lr: 0.000124 grad: 0.0772 (0.0806) loss: 0.8433 (0.8427) time: 0.2694 data: 0.0002 max mem: 26157 Train: [10] [1700/6250] eta: 0:20:48 lr: 0.000124 grad: 0.0790 (0.0804) loss: 0.8421 (0.8428) time: 0.2704 data: 0.0002 max mem: 26157 Train: [10] [1800/6250] eta: 0:20:19 lr: 0.000124 grad: 0.0768 (0.0805) loss: 0.8411 (0.8427) time: 0.2683 data: 0.0002 max mem: 26157 Train: [10] [1900/6250] eta: 0:19:51 lr: 0.000124 grad: 0.0721 (0.0806) loss: 0.8444 (0.8428) time: 0.2684 data: 0.0002 max mem: 26157 Train: [10] [2000/6250] eta: 0:19:22 lr: 0.000124 grad: 0.0730 (0.0805) loss: 0.8406 (0.8428) time: 0.2680 data: 0.0002 max mem: 26157 Train: [10] [2100/6250] eta: 0:18:54 lr: 0.000124 grad: 0.0720 (0.0803) loss: 0.8459 (0.8429) time: 0.2686 data: 0.0002 max mem: 26157 Train: [10] [2200/6250] eta: 0:18:27 lr: 0.000124 grad: 0.0830 (0.0803) loss: 0.8448 (0.8429) time: 0.2689 data: 0.0002 max mem: 26157 Train: [10] [2300/6250] eta: 0:17:59 lr: 0.000124 grad: 0.0750 (0.0802) loss: 0.8383 (0.8428) time: 0.2696 data: 0.0002 max mem: 26157 Train: [10] [2400/6250] eta: 0:17:31 lr: 0.000124 grad: 0.0763 (0.0802) loss: 0.8411 (0.8429) time: 0.2685 data: 0.0002 max mem: 26157 Train: [10] [2500/6250] eta: 0:17:03 lr: 0.000124 grad: 0.0785 (0.0802) loss: 0.8436 (0.8429) time: 0.2689 data: 0.0002 max mem: 26157 Train: [10] [2600/6250] eta: 0:16:36 lr: 0.000124 grad: 0.0719 (0.0801) loss: 0.8412 (0.8430) time: 0.2686 data: 0.0002 max mem: 26157 Train: [10] [2700/6250] eta: 0:16:15 lr: 0.000124 grad: 0.0731 (0.0800) loss: 0.8446 (0.8430) time: 0.2682 data: 0.0002 max mem: 26157 Train: [10] [2800/6250] eta: 0:15:47 lr: 0.000124 grad: 0.0756 (0.0798) loss: 0.8456 (0.8431) time: 0.2690 data: 0.0002 max mem: 26157 Train: [10] [2900/6250] eta: 0:15:19 lr: 0.000124 grad: 0.0796 (0.0798) loss: 0.8398 (0.8430) time: 0.2710 data: 0.0002 max mem: 26157 Train: [10] [3000/6250] eta: 0:14:51 lr: 0.000124 grad: 0.0756 (0.0796) loss: 0.8403 (0.8429) time: 0.2684 data: 0.0002 max mem: 26157 Train: [10] [3100/6250] eta: 0:14:23 lr: 0.000124 grad: 0.0763 (0.0796) loss: 0.8404 (0.8429) time: 0.2711 data: 0.0002 max mem: 26157 Train: [10] [3200/6250] eta: 0:13:55 lr: 0.000124 grad: 0.0744 (0.0795) loss: 0.8396 (0.8429) time: 0.2688 data: 0.0002 max mem: 26157 Train: [10] [3300/6250] eta: 0:13:27 lr: 0.000124 grad: 0.0745 (0.0796) loss: 0.8356 (0.8428) time: 0.2684 data: 0.0002 max mem: 26157 Train: [10] [3400/6250] eta: 0:12:59 lr: 0.000124 grad: 0.0747 (0.0795) loss: 0.8414 (0.8428) time: 0.2683 data: 0.0002 max mem: 26157 Train: [10] [3500/6250] eta: 0:12:32 lr: 0.000124 grad: 0.0773 (0.0794) loss: 0.8425 (0.8428) time: 0.2709 data: 0.0002 max mem: 26157 Train: [10] [3600/6250] eta: 0:12:04 lr: 0.000124 grad: 0.0743 (0.0793) loss: 0.8472 (0.8428) time: 0.2686 data: 0.0002 max mem: 26157 Train: [10] [3700/6250] eta: 0:11:36 lr: 0.000124 grad: 0.0722 (0.0793) loss: 0.8414 (0.8428) time: 0.2699 data: 0.0002 max mem: 26157 Train: [10] [3800/6250] eta: 0:11:09 lr: 0.000124 grad: 0.0788 (0.0793) loss: 0.8455 (0.8428) time: 0.2671 data: 0.0002 max mem: 26157 Train: [10] [3900/6250] eta: 0:10:41 lr: 0.000124 grad: 0.0718 (0.0792) loss: 0.8436 (0.8428) time: 0.2695 data: 0.0002 max mem: 26157 Train: [10] [4000/6250] eta: 0:10:14 lr: 0.000124 grad: 0.0737 (0.0791) loss: 0.8385 (0.8428) time: 0.2683 data: 0.0002 max mem: 26157 Train: [10] [4100/6250] eta: 0:09:46 lr: 0.000124 grad: 0.0756 (0.0791) loss: 0.8455 (0.8428) time: 0.2706 data: 0.0002 max mem: 26157 Train: [10] [4200/6250] eta: 0:09:19 lr: 0.000124 grad: 0.0725 (0.0790) loss: 0.8429 (0.8428) time: 0.2681 data: 0.0002 max mem: 26157 Train: [10] [4300/6250] eta: 0:08:51 lr: 0.000124 grad: 0.0735 (0.0789) loss: 0.8451 (0.8428) time: 0.2697 data: 0.0002 max mem: 26157 Train: [10] [4400/6250] eta: 0:08:24 lr: 0.000124 grad: 0.0721 (0.0790) loss: 0.8436 (0.8428) time: 0.2696 data: 0.0002 max mem: 26157 Train: [10] [4500/6250] eta: 0:07:57 lr: 0.000124 grad: 0.0738 (0.0789) loss: 0.8433 (0.8428) time: 0.2685 data: 0.0002 max mem: 26157 Train: [10] [4600/6250] eta: 0:07:29 lr: 0.000124 grad: 0.0739 (0.0789) loss: 0.8451 (0.8428) time: 0.2705 data: 0.0002 max mem: 26157 Train: [10] [4700/6250] eta: 0:07:02 lr: 0.000124 grad: 0.0725 (0.0787) loss: 0.8415 (0.8428) time: 0.2710 data: 0.0002 max mem: 26157 Train: [10] [4800/6250] eta: 0:06:35 lr: 0.000124 grad: 0.0719 (0.0787) loss: 0.8390 (0.8428) time: 0.2716 data: 0.0002 max mem: 26157 Train: [10] [4900/6250] eta: 0:06:07 lr: 0.000124 grad: 0.0726 (0.0786) loss: 0.8436 (0.8427) time: 0.2695 data: 0.0002 max mem: 26157 Train: [10] [5000/6250] eta: 0:05:40 lr: 0.000124 grad: 0.0697 (0.0785) loss: 0.8475 (0.8427) time: 0.2693 data: 0.0002 max mem: 26157 Train: [10] [5100/6250] eta: 0:05:13 lr: 0.000124 grad: 0.0742 (0.0784) loss: 0.8414 (0.8427) time: 0.2705 data: 0.0002 max mem: 26157 Train: [10] [5200/6250] eta: 0:04:45 lr: 0.000124 grad: 0.0752 (0.0784) loss: 0.8439 (0.8426) time: 0.2710 data: 0.0002 max mem: 26157 Train: [10] [5300/6250] eta: 0:04:18 lr: 0.000124 grad: 0.0774 (0.0784) loss: 0.8416 (0.8426) time: 0.2693 data: 0.0002 max mem: 26157 Train: [10] [5400/6250] eta: 0:03:51 lr: 0.000124 grad: 0.0767 (0.0784) loss: 0.8410 (0.8425) time: 0.2681 data: 0.0002 max mem: 26157 Train: [10] [5500/6250] eta: 0:03:24 lr: 0.000124 grad: 0.0753 (0.0783) loss: 0.8362 (0.8425) time: 0.2691 data: 0.0001 max mem: 26157 Train: [10] [5600/6250] eta: 0:02:56 lr: 0.000124 grad: 0.0729 (0.0783) loss: 0.8440 (0.8425) time: 0.2709 data: 0.0002 max mem: 26157 Train: [10] [5700/6250] eta: 0:02:29 lr: 0.000124 grad: 0.0683 (0.0782) loss: 0.8457 (0.8425) time: 0.2688 data: 0.0002 max mem: 26157 Train: [10] [5800/6250] eta: 0:02:02 lr: 0.000124 grad: 0.0729 (0.0781) loss: 0.8470 (0.8425) time: 0.2698 data: 0.0002 max mem: 26157 Train: [10] [5900/6250] eta: 0:01:35 lr: 0.000124 grad: 0.0720 (0.0781) loss: 0.8416 (0.8424) time: 0.2690 data: 0.0002 max mem: 26157 Train: [10] [6000/6250] eta: 0:01:08 lr: 0.000124 grad: 0.0764 (0.0780) loss: 0.8415 (0.8424) time: 0.2683 data: 0.0002 max mem: 26157 Train: [10] [6100/6250] eta: 0:00:40 lr: 0.000124 grad: 0.0710 (0.0780) loss: 0.8424 (0.8424) time: 0.2706 data: 0.0002 max mem: 26157 Train: [10] [6200/6250] eta: 0:00:13 lr: 0.000124 grad: 0.0733 (0.0779) loss: 0.8408 (0.8424) time: 0.2695 data: 0.0002 max mem: 26157 Train: [10] [6249/6250] eta: 0:00:00 lr: 0.000124 grad: 0.0749 (0.0779) loss: 0.8438 (0.8424) time: 0.2689 data: 0.0002 max mem: 26157 Train: [10] Total time: 0:28:31 (0.2738 s / it) Averaged stats: lr: 0.000124 grad: 0.0749 (0.0779) loss: 0.8438 (0.8424) Eval (hcp-train-subset): [10] [ 0/62] eta: 0:04:47 loss: 0.8566 (0.8566) time: 4.6313 data: 4.5475 max mem: 26157 Eval (hcp-train-subset): [10] [61/62] eta: 0:00:00 loss: 0.8471 (0.8477) time: 0.1332 data: 0.0487 max mem: 26157 Eval (hcp-train-subset): [10] Total time: 0:00:13 (0.2157 s / it) Averaged stats (hcp-train-subset): loss: 0.8471 (0.8477) Making plots (hcp-train-subset): example=58 Eval (hcp-val): [10] [ 0/62] eta: 0:03:54 loss: 0.8390 (0.8390) time: 3.7797 data: 3.6541 max mem: 26157 Eval (hcp-val): [10] [61/62] eta: 0:00:00 loss: 0.8417 (0.8426) time: 0.1297 data: 0.0452 max mem: 26157 Eval (hcp-val): [10] Total time: 0:00:14 (0.2368 s / it) Averaged stats (hcp-val): loss: 0.8417 (0.8426) Making plots (hcp-val): example=55 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [11] [ 0/6250] eta: 10:57:05 lr: 0.000124 grad: 0.0578 (0.0578) loss: 0.8646 (0.8646) time: 6.3081 data: 5.9466 max mem: 26157 Train: [11] [ 100/6250] eta: 0:35:19 lr: 0.000124 grad: 0.0795 (0.0885) loss: 0.8505 (0.8485) time: 0.2734 data: 0.0003 max mem: 26157 Train: [11] [ 200/6250] eta: 0:31:04 lr: 0.000124 grad: 0.0760 (0.0855) loss: 0.8417 (0.8458) time: 0.2741 data: 0.0003 max mem: 26157 Train: [11] [ 300/6250] eta: 0:29:17 lr: 0.000124 grad: 0.0713 (0.0823) loss: 0.8458 (0.8456) time: 0.2697 data: 0.0002 max mem: 26157 Train: [11] [ 400/6250] eta: 0:28:10 lr: 0.000124 grad: 0.0768 (0.0807) loss: 0.8426 (0.8449) time: 0.2700 data: 0.0002 max mem: 26157 Train: [11] [ 500/6250] eta: 0:27:20 lr: 0.000124 grad: 0.0741 (0.0792) loss: 0.8403 (0.8444) time: 0.2699 data: 0.0002 max mem: 26157 Train: [11] [ 600/6250] eta: 0:26:37 lr: 0.000124 grad: 0.0697 (0.0790) loss: 0.8455 (0.8442) time: 0.2696 data: 0.0002 max mem: 26157 Train: [11] [ 700/6250] eta: 0:25:59 lr: 0.000124 grad: 0.0698 (0.0787) loss: 0.8451 (0.8440) time: 0.2723 data: 0.0002 max mem: 26157 Train: [11] [ 800/6250] eta: 0:25:24 lr: 0.000124 grad: 0.0734 (0.0786) loss: 0.8475 (0.8440) time: 0.2691 data: 0.0002 max mem: 26157 Train: [11] [ 900/6250] eta: 0:24:50 lr: 0.000124 grad: 0.0713 (0.0782) loss: 0.8447 (0.8438) time: 0.2697 data: 0.0002 max mem: 26157 Train: [11] [1000/6250] eta: 0:24:18 lr: 0.000124 grad: 0.0694 (0.0777) loss: 0.8448 (0.8440) time: 0.2708 data: 0.0002 max mem: 26157 Train: [11] [1100/6250] eta: 0:24:04 lr: 0.000124 grad: 0.0674 (0.0771) loss: 0.8424 (0.8440) time: 0.2696 data: 0.0002 max mem: 26157 Train: [11] [1200/6250] eta: 0:23:56 lr: 0.000124 grad: 0.0701 (0.0767) loss: 0.8517 (0.8441) time: 0.2696 data: 0.0002 max mem: 26157 Train: [11] [1300/6250] eta: 0:23:23 lr: 0.000124 grad: 0.0727 (0.0766) loss: 0.8431 (0.8441) time: 0.2689 data: 0.0002 max mem: 26157 Train: [11] [1400/6250] eta: 0:22:49 lr: 0.000124 grad: 0.0734 (0.0763) loss: 0.8467 (0.8441) time: 0.2677 data: 0.0002 max mem: 26157 Train: [11] [1500/6250] eta: 0:22:17 lr: 0.000124 grad: 0.0702 (0.0761) loss: 0.8461 (0.8440) time: 0.2714 data: 0.0002 max mem: 26157 Train: [11] [1600/6250] eta: 0:21:45 lr: 0.000124 grad: 0.0708 (0.0758) loss: 0.8386 (0.8439) time: 0.2690 data: 0.0002 max mem: 26157 Train: [11] [1700/6250] eta: 0:21:14 lr: 0.000124 grad: 0.0702 (0.0758) loss: 0.8380 (0.8437) time: 0.2683 data: 0.0002 max mem: 26157 Train: [11] [1800/6250] eta: 0:20:43 lr: 0.000124 grad: 0.0731 (0.0758) loss: 0.8402 (0.8436) time: 0.2701 data: 0.0002 max mem: 26157 Train: [11] [1900/6250] eta: 0:20:13 lr: 0.000124 grad: 0.0717 (0.0756) loss: 0.8421 (0.8434) time: 0.2697 data: 0.0002 max mem: 26157 Train: [11] [2000/6250] eta: 0:19:43 lr: 0.000124 grad: 0.0721 (0.0754) loss: 0.8413 (0.8433) time: 0.2685 data: 0.0002 max mem: 26157 Train: [11] [2100/6250] eta: 0:19:13 lr: 0.000124 grad: 0.0705 (0.0754) loss: 0.8436 (0.8431) time: 0.2688 data: 0.0002 max mem: 26157 Train: [11] [2200/6250] eta: 0:18:44 lr: 0.000124 grad: 0.0729 (0.0754) loss: 0.8363 (0.8429) time: 0.2701 data: 0.0002 max mem: 26157 Train: [11] [2300/6250] eta: 0:18:15 lr: 0.000124 grad: 0.0725 (0.0754) loss: 0.8333 (0.8427) time: 0.2708 data: 0.0002 max mem: 26157 Train: [11] [2400/6250] eta: 0:17:46 lr: 0.000124 grad: 0.0722 (0.0753) loss: 0.8385 (0.8425) time: 0.2748 data: 0.0002 max mem: 26157 Train: [11] [2500/6250] eta: 0:17:30 lr: 0.000124 grad: 0.0697 (0.0753) loss: 0.8384 (0.8423) time: 0.2687 data: 0.0002 max mem: 26157 Train: [11] [2600/6250] eta: 0:17:00 lr: 0.000124 grad: 0.0753 (0.0753) loss: 0.8400 (0.8423) time: 0.2701 data: 0.0002 max mem: 26157 Train: [11] [2700/6250] eta: 0:16:31 lr: 0.000124 grad: 0.0708 (0.0752) loss: 0.8409 (0.8422) time: 0.2691 data: 0.0002 max mem: 26157 Train: [11] [2800/6250] eta: 0:16:02 lr: 0.000124 grad: 0.0757 (0.0753) loss: 0.8417 (0.8420) time: 0.2679 data: 0.0002 max mem: 26157 Train: [11] [2900/6250] eta: 0:15:33 lr: 0.000124 grad: 0.0765 (0.0754) loss: 0.8331 (0.8419) time: 0.2688 data: 0.0002 max mem: 26157 Train: [11] [3000/6250] eta: 0:15:04 lr: 0.000124 grad: 0.0768 (0.0753) loss: 0.8358 (0.8418) time: 0.2722 data: 0.0002 max mem: 26157 Train: [11] [3100/6250] eta: 0:14:35 lr: 0.000124 grad: 0.0725 (0.0753) loss: 0.8413 (0.8417) time: 0.2707 data: 0.0002 max mem: 26157 Train: [11] [3200/6250] eta: 0:14:06 lr: 0.000124 grad: 0.0706 (0.0753) loss: 0.8377 (0.8416) time: 0.2680 data: 0.0002 max mem: 26157 Train: [11] [3300/6250] eta: 0:13:38 lr: 0.000124 grad: 0.0685 (0.0753) loss: 0.8405 (0.8415) time: 0.2677 data: 0.0001 max mem: 26157 Train: [11] [3400/6250] eta: 0:13:10 lr: 0.000124 grad: 0.0671 (0.0752) loss: 0.8381 (0.8414) time: 0.2718 data: 0.0002 max mem: 26157 Train: [11] [3500/6250] eta: 0:12:41 lr: 0.000124 grad: 0.0715 (0.0752) loss: 0.8394 (0.8414) time: 0.2754 data: 0.0003 max mem: 26157 Train: [11] [3600/6250] eta: 0:12:13 lr: 0.000124 grad: 0.0686 (0.0751) loss: 0.8409 (0.8413) time: 0.2706 data: 0.0002 max mem: 26157 Train: [11] [3700/6250] eta: 0:11:45 lr: 0.000124 grad: 0.0694 (0.0750) loss: 0.8349 (0.8413) time: 0.2697 data: 0.0002 max mem: 26157 Train: [11] [3800/6250] eta: 0:11:17 lr: 0.000124 grad: 0.0740 (0.0750) loss: 0.8371 (0.8413) time: 0.2707 data: 0.0002 max mem: 26157 Train: [11] [3900/6250] eta: 0:10:49 lr: 0.000124 grad: 0.0733 (0.0750) loss: 0.8421 (0.8412) time: 0.2706 data: 0.0002 max mem: 26157 Train: [11] [4000/6250] eta: 0:10:21 lr: 0.000123 grad: 0.0706 (0.0750) loss: 0.8389 (0.8412) time: 0.2687 data: 0.0002 max mem: 26157 Train: [11] [4100/6250] eta: 0:09:53 lr: 0.000123 grad: 0.0711 (0.0750) loss: 0.8401 (0.8411) time: 0.2706 data: 0.0002 max mem: 26157 Train: [11] [4200/6250] eta: 0:09:25 lr: 0.000123 grad: 0.0714 (0.0749) loss: 0.8368 (0.8411) time: 0.2685 data: 0.0002 max mem: 26157 Train: [11] [4300/6250] eta: 0:08:57 lr: 0.000123 grad: 0.0698 (0.0750) loss: 0.8400 (0.8410) time: 0.2691 data: 0.0002 max mem: 26157 Train: [11] [4400/6250] eta: 0:08:29 lr: 0.000123 grad: 0.0707 (0.0750) loss: 0.8396 (0.8409) time: 0.2708 data: 0.0002 max mem: 26157 Train: [11] [4500/6250] eta: 0:08:01 lr: 0.000123 grad: 0.0760 (0.0750) loss: 0.8310 (0.8408) time: 0.2681 data: 0.0001 max mem: 26157 Train: [11] [4600/6250] eta: 0:07:34 lr: 0.000123 grad: 0.0733 (0.0750) loss: 0.8322 (0.8408) time: 0.2688 data: 0.0002 max mem: 26157 Train: [11] [4700/6250] eta: 0:07:06 lr: 0.000123 grad: 0.0703 (0.0749) loss: 0.8374 (0.8407) time: 0.2679 data: 0.0001 max mem: 26157 Train: [11] [4800/6250] eta: 0:06:38 lr: 0.000123 grad: 0.0731 (0.0749) loss: 0.8387 (0.8406) time: 0.2706 data: 0.0002 max mem: 26157 Train: [11] [4900/6250] eta: 0:06:11 lr: 0.000123 grad: 0.0726 (0.0749) loss: 0.8351 (0.8405) time: 0.2697 data: 0.0002 max mem: 26157 Train: [11] [5000/6250] eta: 0:05:43 lr: 0.000123 grad: 0.0738 (0.0749) loss: 0.8359 (0.8404) time: 0.2734 data: 0.0002 max mem: 26157 Train: [11] [5100/6250] eta: 0:05:16 lr: 0.000123 grad: 0.0697 (0.0749) loss: 0.8331 (0.8403) time: 0.2693 data: 0.0002 max mem: 26157 Train: [11] [5200/6250] eta: 0:04:48 lr: 0.000123 grad: 0.0707 (0.0749) loss: 0.8340 (0.8402) time: 0.2696 data: 0.0002 max mem: 26157 Train: [11] [5300/6250] eta: 0:04:20 lr: 0.000123 grad: 0.0723 (0.0749) loss: 0.8328 (0.8401) time: 0.2709 data: 0.0002 max mem: 26157 Train: [11] [5400/6250] eta: 0:03:53 lr: 0.000123 grad: 0.0755 (0.0749) loss: 0.8368 (0.8401) time: 0.3496 data: 0.0787 max mem: 26157 Train: [11] [5500/6250] eta: 0:03:26 lr: 0.000123 grad: 0.0772 (0.0749) loss: 0.8396 (0.8400) time: 0.2686 data: 0.0002 max mem: 26157 Train: [11] [5600/6250] eta: 0:02:58 lr: 0.000123 grad: 0.0733 (0.0749) loss: 0.8359 (0.8399) time: 0.2682 data: 0.0001 max mem: 26157 Train: [11] [5700/6250] eta: 0:02:31 lr: 0.000123 grad: 0.0724 (0.0749) loss: 0.8351 (0.8399) time: 0.2704 data: 0.0002 max mem: 26157 Train: [11] [5800/6250] eta: 0:02:03 lr: 0.000123 grad: 0.0732 (0.0749) loss: 0.8365 (0.8399) time: 0.2695 data: 0.0002 max mem: 26157 Train: [11] [5900/6250] eta: 0:01:36 lr: 0.000123 grad: 0.0709 (0.0748) loss: 0.8397 (0.8398) time: 0.2729 data: 0.0002 max mem: 26157 Train: [11] [6000/6250] eta: 0:01:08 lr: 0.000123 grad: 0.0734 (0.0748) loss: 0.8341 (0.8398) time: 0.2688 data: 0.0002 max mem: 26157 Train: [11] [6100/6250] eta: 0:00:41 lr: 0.000123 grad: 0.0735 (0.0748) loss: 0.8358 (0.8398) time: 0.2711 data: 0.0002 max mem: 26157 Train: [11] [6200/6250] eta: 0:00:13 lr: 0.000123 grad: 0.0722 (0.0747) loss: 0.8361 (0.8397) time: 0.2691 data: 0.0002 max mem: 26157 Train: [11] [6249/6250] eta: 0:00:00 lr: 0.000123 grad: 0.0715 (0.0747) loss: 0.8368 (0.8397) time: 0.2687 data: 0.0002 max mem: 26157 Train: [11] Total time: 0:28:40 (0.2752 s / it) Averaged stats: lr: 0.000123 grad: 0.0715 (0.0747) loss: 0.8368 (0.8397) Eval (hcp-train-subset): [11] [ 0/62] eta: 0:05:37 loss: 0.8532 (0.8532) time: 5.4490 data: 5.3648 max mem: 26157 Eval (hcp-train-subset): [11] [61/62] eta: 0:00:00 loss: 0.8456 (0.8458) time: 0.1118 data: 0.0291 max mem: 26157 Eval (hcp-train-subset): [11] Total time: 0:00:12 (0.2056 s / it) Averaged stats (hcp-train-subset): loss: 0.8456 (0.8458) Making plots (hcp-train-subset): example=41 Eval (hcp-val): [11] [ 0/62] eta: 0:04:19 loss: 0.8391 (0.8391) time: 4.1866 data: 4.0725 max mem: 26157 Eval (hcp-val): [11] [61/62] eta: 0:00:00 loss: 0.8400 (0.8408) time: 0.1236 data: 0.0406 max mem: 26157 Eval (hcp-val): [11] Total time: 0:00:13 (0.2216 s / it) Averaged stats (hcp-val): loss: 0.8400 (0.8408) Making plots (hcp-val): example=33 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [12] [ 0/6250] eta: 7:02:41 lr: 0.000123 grad: 0.0528 (0.0528) loss: 0.8708 (0.8708) time: 4.0578 data: 3.6931 max mem: 26157 Train: [12] [ 100/6250] eta: 0:33:44 lr: 0.000123 grad: 0.0809 (0.0763) loss: 0.8501 (0.8551) time: 0.2725 data: 0.0002 max mem: 26157 Train: [12] [ 200/6250] eta: 0:30:17 lr: 0.000123 grad: 0.0687 (0.0754) loss: 0.8470 (0.8500) time: 0.2727 data: 0.0002 max mem: 26157 Train: [12] [ 300/6250] eta: 0:28:45 lr: 0.000123 grad: 0.0694 (0.0741) loss: 0.8481 (0.8485) time: 0.2689 data: 0.0002 max mem: 26157 Train: [12] [ 400/6250] eta: 0:27:46 lr: 0.000123 grad: 0.0778 (0.0750) loss: 0.8408 (0.8469) time: 0.2688 data: 0.0002 max mem: 26157 Train: [12] [ 500/6250] eta: 0:27:00 lr: 0.000123 grad: 0.0716 (0.0742) loss: 0.8373 (0.8456) time: 0.2689 data: 0.0002 max mem: 26157 Train: [12] [ 600/6250] eta: 0:26:19 lr: 0.000123 grad: 0.0743 (0.0742) loss: 0.8387 (0.8441) time: 0.2699 data: 0.0002 max mem: 26157 Train: [12] [ 700/6250] eta: 0:25:44 lr: 0.000123 grad: 0.0711 (0.0746) loss: 0.8423 (0.8432) time: 0.2711 data: 0.0002 max mem: 26157 Train: [12] [ 800/6250] eta: 0:25:10 lr: 0.000123 grad: 0.0752 (0.0745) loss: 0.8408 (0.8424) time: 0.2693 data: 0.0002 max mem: 26157 Train: [12] [ 900/6250] eta: 0:24:40 lr: 0.000123 grad: 0.0679 (0.0744) loss: 0.8387 (0.8419) time: 0.2699 data: 0.0003 max mem: 26157 Train: [12] [1000/6250] eta: 0:24:08 lr: 0.000123 grad: 0.0702 (0.0741) loss: 0.8368 (0.8414) time: 0.2709 data: 0.0003 max mem: 26157 Train: [12] [1100/6250] eta: 0:23:39 lr: 0.000123 grad: 0.0686 (0.0742) loss: 0.8398 (0.8411) time: 0.2726 data: 0.0002 max mem: 26157 Train: [12] [1200/6250] eta: 0:23:09 lr: 0.000123 grad: 0.0678 (0.0740) loss: 0.8373 (0.8407) time: 0.2699 data: 0.0002 max mem: 26157 Train: [12] [1300/6250] eta: 0:23:00 lr: 0.000123 grad: 0.0706 (0.0739) loss: 0.8367 (0.8405) time: 0.5363 data: 0.2641 max mem: 26157 Train: [12] [1400/6250] eta: 0:22:29 lr: 0.000123 grad: 0.0708 (0.0738) loss: 0.8339 (0.8403) time: 0.2691 data: 0.0002 max mem: 26157 Train: [12] [1500/6250] eta: 0:21:59 lr: 0.000123 grad: 0.0716 (0.0738) loss: 0.8384 (0.8402) time: 0.2755 data: 0.0002 max mem: 26157 Train: [12] [1600/6250] eta: 0:21:29 lr: 0.000123 grad: 0.0670 (0.0736) loss: 0.8393 (0.8400) time: 0.2702 data: 0.0002 max mem: 26157 Train: [12] [1700/6250] eta: 0:20:59 lr: 0.000123 grad: 0.0713 (0.0735) loss: 0.8377 (0.8399) time: 0.2686 data: 0.0002 max mem: 26157 Train: [12] [1800/6250] eta: 0:20:29 lr: 0.000123 grad: 0.0713 (0.0735) loss: 0.8378 (0.8398) time: 0.2679 data: 0.0002 max mem: 26157 Train: [12] [1900/6250] eta: 0:20:00 lr: 0.000123 grad: 0.0675 (0.0734) loss: 0.8422 (0.8397) time: 0.2677 data: 0.0002 max mem: 26157 Train: [12] [2000/6250] eta: 0:19:31 lr: 0.000123 grad: 0.0694 (0.0733) loss: 0.8411 (0.8397) time: 0.2684 data: 0.0002 max mem: 26157 Train: [12] [2100/6250] eta: 0:19:02 lr: 0.000123 grad: 0.0650 (0.0731) loss: 0.8385 (0.8398) time: 0.2698 data: 0.0002 max mem: 26157 Train: [12] [2200/6250] eta: 0:18:34 lr: 0.000123 grad: 0.0694 (0.0730) loss: 0.8361 (0.8399) time: 0.2689 data: 0.0002 max mem: 26157 Train: [12] [2300/6250] eta: 0:18:05 lr: 0.000123 grad: 0.0710 (0.0730) loss: 0.8423 (0.8399) time: 0.2701 data: 0.0002 max mem: 26157 Train: [12] [2400/6250] eta: 0:17:37 lr: 0.000123 grad: 0.0687 (0.0729) loss: 0.8395 (0.8398) time: 0.2678 data: 0.0002 max mem: 26157 Train: [12] [2500/6250] eta: 0:17:09 lr: 0.000123 grad: 0.0696 (0.0729) loss: 0.8309 (0.8397) time: 0.2685 data: 0.0002 max mem: 26157 Train: [12] [2600/6250] eta: 0:16:40 lr: 0.000123 grad: 0.0719 (0.0729) loss: 0.8374 (0.8397) time: 0.2689 data: 0.0002 max mem: 26157 Train: [12] [2700/6250] eta: 0:16:12 lr: 0.000123 grad: 0.0720 (0.0729) loss: 0.8361 (0.8398) time: 0.2706 data: 0.0002 max mem: 26157 Train: [12] [2800/6250] eta: 0:15:44 lr: 0.000123 grad: 0.0709 (0.0728) loss: 0.8426 (0.8398) time: 0.2715 data: 0.0002 max mem: 26157 Train: [12] [2900/6250] eta: 0:15:17 lr: 0.000123 grad: 0.0710 (0.0727) loss: 0.8343 (0.8398) time: 0.2694 data: 0.0002 max mem: 26157 Train: [12] [3000/6250] eta: 0:14:49 lr: 0.000123 grad: 0.0703 (0.0727) loss: 0.8369 (0.8397) time: 0.2696 data: 0.0002 max mem: 26157 Train: [12] [3100/6250] eta: 0:14:21 lr: 0.000123 grad: 0.0723 (0.0728) loss: 0.8328 (0.8395) time: 0.2707 data: 0.0002 max mem: 26157 Train: [12] [3200/6250] eta: 0:13:53 lr: 0.000123 grad: 0.0704 (0.0728) loss: 0.8387 (0.8395) time: 0.2689 data: 0.0002 max mem: 26157 Train: [12] [3300/6250] eta: 0:13:26 lr: 0.000123 grad: 0.0700 (0.0728) loss: 0.8394 (0.8394) time: 0.2681 data: 0.0002 max mem: 26157 Train: [12] [3400/6250] eta: 0:12:58 lr: 0.000123 grad: 0.0687 (0.0729) loss: 0.8369 (0.8393) time: 0.2695 data: 0.0001 max mem: 26157 Train: [12] [3500/6250] eta: 0:12:31 lr: 0.000123 grad: 0.0689 (0.0729) loss: 0.8351 (0.8392) time: 0.2709 data: 0.0002 max mem: 26157 Train: [12] [3600/6250] eta: 0:12:03 lr: 0.000123 grad: 0.0795 (0.0729) loss: 0.8310 (0.8391) time: 0.2684 data: 0.0002 max mem: 26157 Train: [12] [3700/6250] eta: 0:11:38 lr: 0.000123 grad: 0.0707 (0.0728) loss: 0.8363 (0.8390) time: 0.4406 data: 0.1663 max mem: 26157 Train: [12] [3800/6250] eta: 0:11:10 lr: 0.000123 grad: 0.0736 (0.0729) loss: 0.8368 (0.8389) time: 0.2685 data: 0.0002 max mem: 26157 Train: [12] [3900/6250] eta: 0:10:43 lr: 0.000123 grad: 0.0708 (0.0728) loss: 0.8392 (0.8388) time: 0.2682 data: 0.0002 max mem: 26157 Train: [12] [4000/6250] eta: 0:10:20 lr: 0.000123 grad: 0.0686 (0.0728) loss: 0.8342 (0.8388) time: 0.2698 data: 0.0002 max mem: 26157 Train: [12] [4100/6250] eta: 0:09:54 lr: 0.000123 grad: 0.0692 (0.0727) loss: 0.8356 (0.8387) time: 0.2716 data: 0.0002 max mem: 26157 Train: [12] [4200/6250] eta: 0:09:26 lr: 0.000123 grad: 0.0739 (0.0727) loss: 0.8372 (0.8386) time: 0.2717 data: 0.0002 max mem: 26157 Train: [12] [4300/6250] eta: 0:08:58 lr: 0.000123 grad: 0.0687 (0.0727) loss: 0.8376 (0.8385) time: 0.2690 data: 0.0001 max mem: 26157 Train: [12] [4400/6250] eta: 0:08:30 lr: 0.000123 grad: 0.0706 (0.0727) loss: 0.8352 (0.8385) time: 0.2699 data: 0.0002 max mem: 26157 Train: [12] [4500/6250] eta: 0:08:02 lr: 0.000123 grad: 0.0707 (0.0726) loss: 0.8311 (0.8385) time: 0.2689 data: 0.0002 max mem: 26157 Train: [12] [4600/6250] eta: 0:07:35 lr: 0.000123 grad: 0.0678 (0.0727) loss: 0.8394 (0.8384) time: 0.2672 data: 0.0002 max mem: 26157 Train: [12] [4700/6250] eta: 0:07:07 lr: 0.000123 grad: 0.0701 (0.0727) loss: 0.8326 (0.8383) time: 0.2726 data: 0.0002 max mem: 26157 Train: [12] [4800/6250] eta: 0:06:39 lr: 0.000123 grad: 0.0705 (0.0727) loss: 0.8349 (0.8382) time: 0.2688 data: 0.0002 max mem: 26157 Train: [12] [4900/6250] eta: 0:06:11 lr: 0.000123 grad: 0.0711 (0.0727) loss: 0.8290 (0.8381) time: 0.2714 data: 0.0002 max mem: 26157 Train: [12] [5000/6250] eta: 0:05:44 lr: 0.000123 grad: 0.0718 (0.0727) loss: 0.8303 (0.8381) time: 0.2697 data: 0.0002 max mem: 26157 Train: [12] [5100/6250] eta: 0:05:16 lr: 0.000123 grad: 0.0707 (0.0727) loss: 0.8266 (0.8379) time: 0.2712 data: 0.0002 max mem: 26157 Train: [12] [5200/6250] eta: 0:04:48 lr: 0.000123 grad: 0.0751 (0.0727) loss: 0.8323 (0.8378) time: 0.2689 data: 0.0002 max mem: 26157 Train: [12] [5300/6250] eta: 0:04:21 lr: 0.000123 grad: 0.0699 (0.0727) loss: 0.8365 (0.8377) time: 0.2696 data: 0.0002 max mem: 26157 Train: [12] [5400/6250] eta: 0:03:53 lr: 0.000123 grad: 0.0704 (0.0727) loss: 0.8319 (0.8377) time: 0.2685 data: 0.0002 max mem: 26157 Train: [12] [5500/6250] eta: 0:03:26 lr: 0.000123 grad: 0.0724 (0.0727) loss: 0.8311 (0.8376) time: 0.2688 data: 0.0002 max mem: 26157 Train: [12] [5600/6250] eta: 0:02:58 lr: 0.000123 grad: 0.0712 (0.0727) loss: 0.8329 (0.8375) time: 0.2682 data: 0.0002 max mem: 26157 Train: [12] [5700/6250] eta: 0:02:31 lr: 0.000123 grad: 0.0758 (0.0727) loss: 0.8338 (0.8375) time: 0.2696 data: 0.0002 max mem: 26157 Train: [12] [5800/6250] eta: 0:02:03 lr: 0.000123 grad: 0.0703 (0.0727) loss: 0.8325 (0.8373) time: 0.2679 data: 0.0001 max mem: 26157 Train: [12] [5900/6250] eta: 0:01:36 lr: 0.000123 grad: 0.0720 (0.0727) loss: 0.8331 (0.8372) time: 0.2687 data: 0.0002 max mem: 26157 Train: [12] [6000/6250] eta: 0:01:08 lr: 0.000123 grad: 0.0758 (0.0727) loss: 0.8302 (0.8371) time: 0.2685 data: 0.0002 max mem: 26157 Train: [12] [6100/6250] eta: 0:00:41 lr: 0.000123 grad: 0.0701 (0.0727) loss: 0.8350 (0.8371) time: 0.2684 data: 0.0002 max mem: 26157 Train: [12] [6200/6250] eta: 0:00:13 lr: 0.000123 grad: 0.0702 (0.0727) loss: 0.8304 (0.8370) time: 0.2721 data: 0.0002 max mem: 26157 Train: [12] [6249/6250] eta: 0:00:00 lr: 0.000123 grad: 0.0686 (0.0727) loss: 0.8277 (0.8369) time: 0.2703 data: 0.0002 max mem: 26157 Train: [12] Total time: 0:28:38 (0.2750 s / it) Averaged stats: lr: 0.000123 grad: 0.0686 (0.0727) loss: 0.8277 (0.8369) Eval (hcp-train-subset): [12] [ 0/62] eta: 0:05:58 loss: 0.8547 (0.8547) time: 5.7887 data: 5.7046 max mem: 26157 Eval (hcp-train-subset): [12] [61/62] eta: 0:00:00 loss: 0.8446 (0.8441) time: 0.1186 data: 0.0360 max mem: 26157 Eval (hcp-train-subset): [12] Total time: 0:00:13 (0.2145 s / it) Averaged stats (hcp-train-subset): loss: 0.8446 (0.8441) Making plots (hcp-train-subset): example=2 Eval (hcp-val): [12] [ 0/62] eta: 0:05:48 loss: 0.8329 (0.8329) time: 5.6236 data: 5.5318 max mem: 26157 Eval (hcp-val): [12] [61/62] eta: 0:00:00 loss: 0.8365 (0.8386) time: 0.1286 data: 0.0460 max mem: 26157 Eval (hcp-val): [12] Total time: 0:00:13 (0.2201 s / it) Averaged stats (hcp-val): loss: 0.8365 (0.8386) Making plots (hcp-val): example=25 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [13] [ 0/6250] eta: 9:39:16 lr: 0.000123 grad: 0.1255 (0.1255) loss: 0.7730 (0.7730) time: 5.5610 data: 5.2304 max mem: 26157 Train: [13] [ 100/6250] eta: 0:33:55 lr: 0.000123 grad: 0.0721 (0.0859) loss: 0.8452 (0.8412) time: 0.2704 data: 0.0002 max mem: 26157 Train: [13] [ 200/6250] eta: 0:30:19 lr: 0.000123 grad: 0.0735 (0.0826) loss: 0.8388 (0.8403) time: 0.2711 data: 0.0002 max mem: 26157 Train: [13] [ 300/6250] eta: 0:28:49 lr: 0.000123 grad: 0.0763 (0.0810) loss: 0.8361 (0.8394) time: 0.2711 data: 0.0002 max mem: 26157 Train: [13] [ 400/6250] eta: 0:27:52 lr: 0.000123 grad: 0.0707 (0.0798) loss: 0.8382 (0.8387) time: 0.2712 data: 0.0002 max mem: 26157 Train: [13] [ 500/6250] eta: 0:27:09 lr: 0.000123 grad: 0.0677 (0.0784) loss: 0.8337 (0.8377) time: 0.2730 data: 0.0002 max mem: 26157 Train: [13] [ 600/6250] eta: 0:26:30 lr: 0.000123 grad: 0.0712 (0.0774) loss: 0.8379 (0.8372) time: 0.2708 data: 0.0002 max mem: 26157 Train: [13] [ 700/6250] eta: 0:25:55 lr: 0.000123 grad: 0.0665 (0.0765) loss: 0.8337 (0.8366) time: 0.2757 data: 0.0002 max mem: 26157 Train: [13] [ 800/6250] eta: 0:25:21 lr: 0.000123 grad: 0.0693 (0.0760) loss: 0.8381 (0.8365) time: 0.2712 data: 0.0002 max mem: 26157 Train: [13] [ 900/6250] eta: 0:24:48 lr: 0.000123 grad: 0.0697 (0.0755) loss: 0.8324 (0.8364) time: 0.2742 data: 0.0002 max mem: 26157 Train: [13] [1000/6250] eta: 0:24:16 lr: 0.000123 grad: 0.0691 (0.0749) loss: 0.8317 (0.8362) time: 0.2679 data: 0.0002 max mem: 26157 Train: [13] [1100/6250] eta: 0:23:44 lr: 0.000123 grad: 0.0655 (0.0744) loss: 0.8368 (0.8362) time: 0.2692 data: 0.0002 max mem: 26157 Train: [13] [1200/6250] eta: 0:23:13 lr: 0.000123 grad: 0.0662 (0.0740) loss: 0.8364 (0.8362) time: 0.2682 data: 0.0002 max mem: 26157 Train: [13] [1300/6250] eta: 0:22:44 lr: 0.000123 grad: 0.0700 (0.0739) loss: 0.8379 (0.8360) time: 0.2712 data: 0.0002 max mem: 26157 Train: [13] [1400/6250] eta: 0:22:14 lr: 0.000123 grad: 0.0655 (0.0734) loss: 0.8411 (0.8361) time: 0.2688 data: 0.0002 max mem: 26157 Train: [13] [1500/6250] eta: 0:21:44 lr: 0.000123 grad: 0.0713 (0.0734) loss: 0.8357 (0.8361) time: 0.2682 data: 0.0001 max mem: 26157 Train: [13] [1600/6250] eta: 0:21:15 lr: 0.000123 grad: 0.0645 (0.0731) loss: 0.8412 (0.8362) time: 0.2683 data: 0.0002 max mem: 26157 Train: [13] [1700/6250] eta: 0:20:46 lr: 0.000123 grad: 0.0687 (0.0730) loss: 0.8384 (0.8361) time: 0.2700 data: 0.0002 max mem: 26157 Train: [13] [1800/6250] eta: 0:20:18 lr: 0.000123 grad: 0.0710 (0.0729) loss: 0.8389 (0.8362) time: 0.2707 data: 0.0002 max mem: 26157 Train: [13] [1900/6250] eta: 0:19:50 lr: 0.000123 grad: 0.0687 (0.0727) loss: 0.8374 (0.8363) time: 0.2695 data: 0.0002 max mem: 26157 Train: [13] [2000/6250] eta: 0:19:21 lr: 0.000123 grad: 0.0664 (0.0725) loss: 0.8382 (0.8364) time: 0.2686 data: 0.0002 max mem: 26157 Train: [13] [2100/6250] eta: 0:18:54 lr: 0.000123 grad: 0.0708 (0.0724) loss: 0.8342 (0.8365) time: 0.2697 data: 0.0002 max mem: 26157 Train: [13] [2200/6250] eta: 0:18:26 lr: 0.000123 grad: 0.0676 (0.0723) loss: 0.8373 (0.8366) time: 0.2702 data: 0.0002 max mem: 26157 Train: [13] [2300/6250] eta: 0:17:58 lr: 0.000123 grad: 0.0677 (0.0722) loss: 0.8360 (0.8366) time: 0.2684 data: 0.0003 max mem: 26157 Train: [13] [2400/6250] eta: 0:17:30 lr: 0.000123 grad: 0.0745 (0.0722) loss: 0.8376 (0.8366) time: 0.2732 data: 0.0002 max mem: 26157 Train: [13] [2500/6250] eta: 0:17:03 lr: 0.000123 grad: 0.0700 (0.0721) loss: 0.8339 (0.8366) time: 0.2688 data: 0.0002 max mem: 26157 Train: [13] [2600/6250] eta: 0:16:35 lr: 0.000123 grad: 0.0734 (0.0721) loss: 0.8333 (0.8365) time: 0.2683 data: 0.0002 max mem: 26157 Train: [13] [2700/6250] eta: 0:16:07 lr: 0.000123 grad: 0.0751 (0.0721) loss: 0.8312 (0.8364) time: 0.2716 data: 0.0002 max mem: 26157 Train: [13] [2800/6250] eta: 0:15:40 lr: 0.000123 grad: 0.0677 (0.0721) loss: 0.8373 (0.8363) time: 0.2683 data: 0.0002 max mem: 26157 Train: [13] [2900/6250] eta: 0:15:12 lr: 0.000123 grad: 0.0708 (0.0721) loss: 0.8287 (0.8363) time: 0.2678 data: 0.0002 max mem: 26157 Train: [13] [3000/6250] eta: 0:14:45 lr: 0.000123 grad: 0.0700 (0.0721) loss: 0.8373 (0.8362) time: 0.2688 data: 0.0001 max mem: 26157 Train: [13] [3100/6250] eta: 0:14:17 lr: 0.000123 grad: 0.0698 (0.0721) loss: 0.8321 (0.8362) time: 0.2680 data: 0.0002 max mem: 26157 Train: [13] [3200/6250] eta: 0:13:49 lr: 0.000123 grad: 0.0678 (0.0721) loss: 0.8368 (0.8361) time: 0.2679 data: 0.0002 max mem: 26157 Train: [13] [3300/6250] eta: 0:13:22 lr: 0.000123 grad: 0.0691 (0.0720) loss: 0.8326 (0.8360) time: 0.2691 data: 0.0001 max mem: 26157 Train: [13] [3400/6250] eta: 0:12:55 lr: 0.000123 grad: 0.0727 (0.0721) loss: 0.8294 (0.8358) time: 0.2730 data: 0.0002 max mem: 26157 Train: [13] [3500/6250] eta: 0:12:28 lr: 0.000123 grad: 0.0733 (0.0721) loss: 0.8298 (0.8357) time: 0.2945 data: 0.0178 max mem: 26157 Train: [13] [3600/6250] eta: 0:12:00 lr: 0.000123 grad: 0.0745 (0.0722) loss: 0.8365 (0.8356) time: 0.2700 data: 0.0002 max mem: 26157 Train: [13] [3700/6250] eta: 0:11:33 lr: 0.000122 grad: 0.0675 (0.0722) loss: 0.8336 (0.8356) time: 0.2707 data: 0.0002 max mem: 26157 Train: [13] [3800/6250] eta: 0:11:06 lr: 0.000122 grad: 0.0724 (0.0722) loss: 0.8326 (0.8356) time: 0.2697 data: 0.0002 max mem: 26157 Train: [13] [3900/6250] eta: 0:10:38 lr: 0.000122 grad: 0.0724 (0.0724) loss: 0.8310 (0.8355) time: 0.2686 data: 0.0002 max mem: 26157 Train: [13] [4000/6250] eta: 0:10:11 lr: 0.000122 grad: 0.0708 (0.0724) loss: 0.8327 (0.8354) time: 0.2694 data: 0.0002 max mem: 26157 Train: [13] [4100/6250] eta: 0:09:44 lr: 0.000122 grad: 0.0735 (0.0724) loss: 0.8341 (0.8354) time: 0.2721 data: 0.0002 max mem: 26157 Train: [13] [4200/6250] eta: 0:09:16 lr: 0.000122 grad: 0.0701 (0.0724) loss: 0.8361 (0.8353) time: 0.2728 data: 0.0002 max mem: 26157 Train: [13] [4300/6250] eta: 0:08:49 lr: 0.000122 grad: 0.0756 (0.0725) loss: 0.8313 (0.8353) time: 0.2682 data: 0.0002 max mem: 26157 Train: [13] [4400/6250] eta: 0:08:22 lr: 0.000122 grad: 0.0737 (0.0724) loss: 0.8277 (0.8352) time: 0.2688 data: 0.0002 max mem: 26157 Train: [13] [4500/6250] eta: 0:07:55 lr: 0.000122 grad: 0.0709 (0.0725) loss: 0.8294 (0.8351) time: 0.2689 data: 0.0001 max mem: 26157 Train: [13] [4600/6250] eta: 0:07:27 lr: 0.000122 grad: 0.0740 (0.0725) loss: 0.8391 (0.8350) time: 0.2685 data: 0.0002 max mem: 26157 Train: [13] [4700/6250] eta: 0:07:00 lr: 0.000122 grad: 0.0719 (0.0725) loss: 0.8317 (0.8350) time: 0.2706 data: 0.0002 max mem: 26157 Train: [13] [4800/6250] eta: 0:06:33 lr: 0.000122 grad: 0.0731 (0.0725) loss: 0.8334 (0.8349) time: 0.2691 data: 0.0001 max mem: 26157 Train: [13] [4900/6250] eta: 0:06:06 lr: 0.000122 grad: 0.0712 (0.0726) loss: 0.8291 (0.8348) time: 0.2689 data: 0.0002 max mem: 26157 Train: [13] [5000/6250] eta: 0:05:39 lr: 0.000122 grad: 0.0740 (0.0727) loss: 0.8321 (0.8347) time: 0.2702 data: 0.0002 max mem: 26157 Train: [13] [5100/6250] eta: 0:05:11 lr: 0.000122 grad: 0.0725 (0.0727) loss: 0.8308 (0.8347) time: 0.2700 data: 0.0002 max mem: 26157 Train: [13] [5200/6250] eta: 0:04:44 lr: 0.000122 grad: 0.0687 (0.0727) loss: 0.8317 (0.8346) time: 0.2778 data: 0.0003 max mem: 26157 Train: [13] [5300/6250] eta: 0:04:17 lr: 0.000122 grad: 0.0711 (0.0727) loss: 0.8279 (0.8346) time: 0.2709 data: 0.0002 max mem: 26157 Train: [13] [5400/6250] eta: 0:03:50 lr: 0.000122 grad: 0.0708 (0.0727) loss: 0.8306 (0.8345) time: 0.2704 data: 0.0002 max mem: 26157 Train: [13] [5500/6250] eta: 0:03:23 lr: 0.000122 grad: 0.0701 (0.0728) loss: 0.8329 (0.8345) time: 0.2694 data: 0.0002 max mem: 26157 Train: [13] [5600/6250] eta: 0:02:56 lr: 0.000122 grad: 0.0705 (0.0729) loss: 0.8325 (0.8345) time: 0.2675 data: 0.0001 max mem: 26157 Train: [13] [5700/6250] eta: 0:02:29 lr: 0.000122 grad: 0.0721 (0.0729) loss: 0.8287 (0.8344) time: 0.2693 data: 0.0002 max mem: 26157 Train: [13] [5800/6250] eta: 0:02:02 lr: 0.000122 grad: 0.0738 (0.0730) loss: 0.8253 (0.8343) time: 0.2693 data: 0.0002 max mem: 26157 Train: [13] [5900/6250] eta: 0:01:34 lr: 0.000122 grad: 0.0737 (0.0730) loss: 0.8310 (0.8342) time: 0.2678 data: 0.0002 max mem: 26157 Train: [13] [6000/6250] eta: 0:01:07 lr: 0.000122 grad: 0.0701 (0.0731) loss: 0.8319 (0.8341) time: 0.2708 data: 0.0002 max mem: 26157 Train: [13] [6100/6250] eta: 0:00:40 lr: 0.000122 grad: 0.0765 (0.0731) loss: 0.8302 (0.8341) time: 0.2688 data: 0.0002 max mem: 26157 Train: [13] [6200/6250] eta: 0:00:13 lr: 0.000122 grad: 0.0701 (0.0731) loss: 0.8382 (0.8341) time: 0.2704 data: 0.0002 max mem: 26157 Train: [13] [6249/6250] eta: 0:00:00 lr: 0.000122 grad: 0.0682 (0.0731) loss: 0.8338 (0.8341) time: 0.2719 data: 0.0002 max mem: 26157 Train: [13] Total time: 0:28:19 (0.2719 s / it) Averaged stats: lr: 0.000122 grad: 0.0682 (0.0731) loss: 0.8338 (0.8341) Eval (hcp-train-subset): [13] [ 0/62] eta: 0:05:14 loss: 0.8543 (0.8543) time: 5.0772 data: 4.9930 max mem: 26157 Eval (hcp-train-subset): [13] [61/62] eta: 0:00:00 loss: 0.8418 (0.8427) time: 0.1244 data: 0.0399 max mem: 26157 Eval (hcp-train-subset): [13] Total time: 0:00:12 (0.2047 s / it) Averaged stats (hcp-train-subset): loss: 0.8418 (0.8427) Making plots (hcp-train-subset): example=61 Eval (hcp-val): [13] [ 0/62] eta: 0:03:10 loss: 0.8306 (0.8306) time: 3.0745 data: 2.9475 max mem: 26157 Eval (hcp-val): [13] [61/62] eta: 0:00:00 loss: 0.8376 (0.8377) time: 0.1402 data: 0.0572 max mem: 26157 Eval (hcp-val): [13] Total time: 0:00:12 (0.2088 s / it) Averaged stats (hcp-val): loss: 0.8376 (0.8377) Making plots (hcp-val): example=36 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [14] [ 0/6250] eta: 11:17:42 lr: 0.000122 grad: 0.0604 (0.0604) loss: 0.8633 (0.8633) time: 6.5060 data: 6.2310 max mem: 26157 Train: [14] [ 100/6250] eta: 0:35:01 lr: 0.000122 grad: 0.0761 (0.0848) loss: 0.8379 (0.8407) time: 0.2704 data: 0.0002 max mem: 26157 Train: [14] [ 200/6250] eta: 0:30:54 lr: 0.000122 grad: 0.0729 (0.0826) loss: 0.8293 (0.8371) time: 0.2725 data: 0.0002 max mem: 26157 Train: [14] [ 300/6250] eta: 0:29:11 lr: 0.000122 grad: 0.0684 (0.0793) loss: 0.8355 (0.8357) time: 0.2702 data: 0.0002 max mem: 26157 Train: [14] [ 400/6250] eta: 0:28:05 lr: 0.000122 grad: 0.0691 (0.0774) loss: 0.8351 (0.8356) time: 0.2702 data: 0.0002 max mem: 26157 Train: [14] [ 500/6250] eta: 0:27:16 lr: 0.000122 grad: 0.0693 (0.0762) loss: 0.8308 (0.8360) time: 0.2716 data: 0.0002 max mem: 26157 Train: [14] [ 600/6250] eta: 0:26:35 lr: 0.000122 grad: 0.0697 (0.0754) loss: 0.8317 (0.8358) time: 0.2694 data: 0.0002 max mem: 26157 Train: [14] [ 700/6250] eta: 0:25:57 lr: 0.000122 grad: 0.0711 (0.0754) loss: 0.8297 (0.8353) time: 0.2687 data: 0.0002 max mem: 26157 Train: [14] [ 800/6250] eta: 0:25:21 lr: 0.000122 grad: 0.0694 (0.0751) loss: 0.8341 (0.8352) time: 0.2702 data: 0.0002 max mem: 26157 Train: [14] [ 900/6250] eta: 0:24:49 lr: 0.000122 grad: 0.0688 (0.0747) loss: 0.8366 (0.8351) time: 0.2714 data: 0.0002 max mem: 26157 Train: [14] [1000/6250] eta: 0:24:17 lr: 0.000122 grad: 0.0730 (0.0743) loss: 0.8349 (0.8352) time: 0.2729 data: 0.0002 max mem: 26157 Train: [14] [1100/6250] eta: 0:23:46 lr: 0.000122 grad: 0.0714 (0.0740) loss: 0.8361 (0.8353) time: 0.2723 data: 0.0002 max mem: 26157 Train: [14] [1200/6250] eta: 0:23:15 lr: 0.000122 grad: 0.0700 (0.0737) loss: 0.8369 (0.8353) time: 0.2686 data: 0.0002 max mem: 26157 Train: [14] [1300/6250] eta: 0:22:45 lr: 0.000122 grad: 0.0712 (0.0734) loss: 0.8457 (0.8354) time: 0.2687 data: 0.0002 max mem: 26157 Train: [14] [1400/6250] eta: 0:22:15 lr: 0.000122 grad: 0.0690 (0.0731) loss: 0.8384 (0.8355) time: 0.2697 data: 0.0002 max mem: 26157 Train: [14] [1500/6250] eta: 0:21:46 lr: 0.000122 grad: 0.0704 (0.0729) loss: 0.8359 (0.8355) time: 0.2717 data: 0.0002 max mem: 26157 Train: [14] [1600/6250] eta: 0:21:17 lr: 0.000122 grad: 0.0706 (0.0727) loss: 0.8387 (0.8355) time: 0.2712 data: 0.0002 max mem: 26157 Train: [14] [1700/6250] eta: 0:20:49 lr: 0.000122 grad: 0.0680 (0.0726) loss: 0.8346 (0.8356) time: 0.2703 data: 0.0002 max mem: 26157 Train: [14] [1800/6250] eta: 0:20:20 lr: 0.000122 grad: 0.0727 (0.0724) loss: 0.8362 (0.8356) time: 0.2708 data: 0.0002 max mem: 26157 Train: [14] [1900/6250] eta: 0:19:51 lr: 0.000122 grad: 0.0693 (0.0723) loss: 0.8378 (0.8356) time: 0.2694 data: 0.0002 max mem: 26157 Train: [14] [2000/6250] eta: 0:19:23 lr: 0.000122 grad: 0.0707 (0.0722) loss: 0.8363 (0.8356) time: 0.2683 data: 0.0002 max mem: 26157 Train: [14] [2100/6250] eta: 0:18:55 lr: 0.000122 grad: 0.0668 (0.0722) loss: 0.8376 (0.8355) time: 0.2693 data: 0.0002 max mem: 26157 Train: [14] [2200/6250] eta: 0:18:27 lr: 0.000122 grad: 0.0763 (0.0723) loss: 0.8352 (0.8355) time: 0.2682 data: 0.0002 max mem: 26157 Train: [14] [2300/6250] eta: 0:17:58 lr: 0.000122 grad: 0.0693 (0.0722) loss: 0.8340 (0.8355) time: 0.2690 data: 0.0002 max mem: 26157 Train: [14] [2400/6250] eta: 0:17:30 lr: 0.000122 grad: 0.0666 (0.0722) loss: 0.8355 (0.8354) time: 0.2687 data: 0.0002 max mem: 26157 Train: [14] [2500/6250] eta: 0:17:03 lr: 0.000122 grad: 0.0678 (0.0722) loss: 0.8301 (0.8353) time: 0.2711 data: 0.0002 max mem: 26157 Train: [14] [2600/6250] eta: 0:16:35 lr: 0.000122 grad: 0.0723 (0.0723) loss: 0.8323 (0.8353) time: 0.2707 data: 0.0002 max mem: 26157 Train: [14] [2700/6250] eta: 0:16:07 lr: 0.000122 grad: 0.0671 (0.0722) loss: 0.8361 (0.8353) time: 0.2697 data: 0.0002 max mem: 26157 Train: [14] [2800/6250] eta: 0:15:40 lr: 0.000122 grad: 0.0670 (0.0722) loss: 0.8392 (0.8354) time: 0.2687 data: 0.0002 max mem: 26157 Train: [14] [2900/6250] eta: 0:15:12 lr: 0.000122 grad: 0.0687 (0.0721) loss: 0.8346 (0.8353) time: 0.2698 data: 0.0002 max mem: 26157 Train: [14] [3000/6250] eta: 0:14:46 lr: 0.000122 grad: 0.0711 (0.0722) loss: 0.8338 (0.8353) time: 0.3347 data: 0.0590 max mem: 26157 Train: [14] [3100/6250] eta: 0:14:19 lr: 0.000122 grad: 0.0687 (0.0721) loss: 0.8324 (0.8352) time: 0.2718 data: 0.0002 max mem: 26157 Train: [14] [3200/6250] eta: 0:13:58 lr: 0.000122 grad: 0.0731 (0.0721) loss: 0.8312 (0.8351) time: 0.2708 data: 0.0001 max mem: 26157 Train: [14] [3300/6250] eta: 0:13:30 lr: 0.000122 grad: 0.0692 (0.0721) loss: 0.8368 (0.8350) time: 0.2736 data: 0.0002 max mem: 26157 Train: [14] [3400/6250] eta: 0:13:06 lr: 0.000122 grad: 0.0704 (0.0721) loss: 0.8264 (0.8349) time: 0.2715 data: 0.0002 max mem: 26157 Train: [14] [3500/6250] eta: 0:12:38 lr: 0.000122 grad: 0.0663 (0.0721) loss: 0.8308 (0.8348) time: 0.2718 data: 0.0002 max mem: 26157 Train: [14] [3600/6250] eta: 0:12:10 lr: 0.000122 grad: 0.0700 (0.0720) loss: 0.8325 (0.8347) time: 0.2672 data: 0.0002 max mem: 26157 Train: [14] [3700/6250] eta: 0:11:42 lr: 0.000122 grad: 0.0710 (0.0720) loss: 0.8346 (0.8346) time: 0.2684 data: 0.0002 max mem: 26157 Train: [14] [3800/6250] eta: 0:11:14 lr: 0.000122 grad: 0.0704 (0.0720) loss: 0.8319 (0.8345) time: 0.2683 data: 0.0002 max mem: 26157 Train: [14] [3900/6250] eta: 0:10:47 lr: 0.000122 grad: 0.0694 (0.0720) loss: 0.8331 (0.8345) time: 0.2693 data: 0.0002 max mem: 26157 Train: [14] [4000/6250] eta: 0:10:19 lr: 0.000122 grad: 0.0695 (0.0720) loss: 0.8293 (0.8345) time: 0.2690 data: 0.0002 max mem: 26157 Train: [14] [4100/6250] eta: 0:09:51 lr: 0.000122 grad: 0.0660 (0.0720) loss: 0.8412 (0.8345) time: 0.2678 data: 0.0001 max mem: 26157 Train: [14] [4200/6250] eta: 0:09:23 lr: 0.000122 grad: 0.0697 (0.0720) loss: 0.8415 (0.8345) time: 0.2690 data: 0.0001 max mem: 26157 Train: [14] [4300/6250] eta: 0:08:55 lr: 0.000122 grad: 0.0708 (0.0720) loss: 0.8308 (0.8345) time: 0.2711 data: 0.0003 max mem: 26157 Train: [14] [4400/6250] eta: 0:08:28 lr: 0.000122 grad: 0.0718 (0.0720) loss: 0.8320 (0.8345) time: 0.2693 data: 0.0002 max mem: 26157 Train: [14] [4500/6250] eta: 0:08:00 lr: 0.000122 grad: 0.0684 (0.0719) loss: 0.8340 (0.8344) time: 0.2687 data: 0.0002 max mem: 26157 Train: [14] [4600/6250] eta: 0:07:32 lr: 0.000122 grad: 0.0679 (0.0719) loss: 0.8314 (0.8344) time: 0.2700 data: 0.0002 max mem: 26157 Train: [14] [4700/6250] eta: 0:07:05 lr: 0.000122 grad: 0.0687 (0.0719) loss: 0.8318 (0.8343) time: 0.2688 data: 0.0002 max mem: 26157 Train: [14] [4800/6250] eta: 0:06:37 lr: 0.000122 grad: 0.0683 (0.0719) loss: 0.8294 (0.8343) time: 0.2708 data: 0.0002 max mem: 26157 Train: [14] [4900/6250] eta: 0:06:10 lr: 0.000122 grad: 0.0674 (0.0719) loss: 0.8295 (0.8342) time: 0.2699 data: 0.0001 max mem: 26157 Train: [14] [5000/6250] eta: 0:05:42 lr: 0.000122 grad: 0.0684 (0.0719) loss: 0.8301 (0.8342) time: 0.2685 data: 0.0002 max mem: 26157 Train: [14] [5100/6250] eta: 0:05:14 lr: 0.000122 grad: 0.0727 (0.0718) loss: 0.8325 (0.8342) time: 0.2701 data: 0.0002 max mem: 26157 Train: [14] [5200/6250] eta: 0:04:47 lr: 0.000122 grad: 0.0699 (0.0718) loss: 0.8371 (0.8342) time: 0.2691 data: 0.0002 max mem: 26157 Train: [14] [5300/6250] eta: 0:04:20 lr: 0.000122 grad: 0.0715 (0.0717) loss: 0.8349 (0.8342) time: 0.2702 data: 0.0002 max mem: 26157 Train: [14] [5400/6250] eta: 0:03:52 lr: 0.000122 grad: 0.0669 (0.0718) loss: 0.8343 (0.8342) time: 0.2700 data: 0.0002 max mem: 26157 Train: [14] [5500/6250] eta: 0:03:25 lr: 0.000122 grad: 0.0676 (0.0718) loss: 0.8365 (0.8342) time: 0.2718 data: 0.0002 max mem: 26157 Train: [14] [5600/6250] eta: 0:02:57 lr: 0.000122 grad: 0.0759 (0.0718) loss: 0.8314 (0.8342) time: 0.2700 data: 0.0002 max mem: 26157 Train: [14] [5700/6250] eta: 0:02:30 lr: 0.000122 grad: 0.0667 (0.0718) loss: 0.8369 (0.8342) time: 0.2722 data: 0.0002 max mem: 26157 Train: [14] [5800/6250] eta: 0:02:03 lr: 0.000122 grad: 0.0673 (0.0718) loss: 0.8329 (0.8342) time: 0.2696 data: 0.0002 max mem: 26157 Train: [14] [5900/6250] eta: 0:01:35 lr: 0.000122 grad: 0.0679 (0.0717) loss: 0.8355 (0.8343) time: 0.2700 data: 0.0002 max mem: 26157 Train: [14] [6000/6250] eta: 0:01:08 lr: 0.000122 grad: 0.0702 (0.0717) loss: 0.8346 (0.8343) time: 0.2719 data: 0.0002 max mem: 26157 Train: [14] [6100/6250] eta: 0:00:41 lr: 0.000122 grad: 0.0691 (0.0717) loss: 0.8362 (0.8343) time: 0.2688 data: 0.0001 max mem: 26157 Train: [14] [6200/6250] eta: 0:00:13 lr: 0.000122 grad: 0.0689 (0.0718) loss: 0.8293 (0.8342) time: 0.2698 data: 0.0001 max mem: 26157 Train: [14] [6249/6250] eta: 0:00:00 lr: 0.000122 grad: 0.0690 (0.0718) loss: 0.8328 (0.8342) time: 0.2706 data: 0.0002 max mem: 26157 Train: [14] Total time: 0:28:33 (0.2742 s / it) Averaged stats: lr: 0.000122 grad: 0.0690 (0.0718) loss: 0.8328 (0.8342) Eval (hcp-train-subset): [14] [ 0/62] eta: 0:03:14 loss: 0.8519 (0.8519) time: 3.1363 data: 3.0155 max mem: 26157 Eval (hcp-train-subset): [14] [61/62] eta: 0:00:00 loss: 0.8425 (0.8429) time: 0.1272 data: 0.0437 max mem: 26157 Eval (hcp-train-subset): [14] Total time: 0:00:13 (0.2108 s / it) Averaged stats (hcp-train-subset): loss: 0.8425 (0.8429) Making plots (hcp-train-subset): example=40 Eval (hcp-val): [14] [ 0/62] eta: 0:05:43 loss: 0.8318 (0.8318) time: 5.5418 data: 5.4582 max mem: 26157 Eval (hcp-val): [14] [61/62] eta: 0:00:00 loss: 0.8378 (0.8379) time: 0.1292 data: 0.0445 max mem: 26157 Eval (hcp-val): [14] Total time: 0:00:12 (0.2084 s / it) Averaged stats (hcp-val): loss: 0.8378 (0.8379) Making plots (hcp-val): example=51 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [15] [ 0/6250] eta: 8:26:10 lr: 0.000122 grad: 0.2194 (0.2194) loss: 0.7838 (0.7838) time: 4.8593 data: 4.5079 max mem: 26157 Train: [15] [ 100/6250] eta: 0:34:14 lr: 0.000122 grad: 0.0820 (0.0887) loss: 0.8124 (0.8322) time: 0.2688 data: 0.0002 max mem: 26157 Train: [15] [ 200/6250] eta: 0:30:33 lr: 0.000122 grad: 0.0690 (0.0811) loss: 0.8460 (0.8324) time: 0.2712 data: 0.0002 max mem: 26157 Train: [15] [ 300/6250] eta: 0:28:56 lr: 0.000122 grad: 0.0688 (0.0787) loss: 0.8308 (0.8318) time: 0.2691 data: 0.0002 max mem: 26157 Train: [15] [ 400/6250] eta: 0:27:56 lr: 0.000122 grad: 0.0695 (0.0775) loss: 0.8264 (0.8316) time: 0.2712 data: 0.0002 max mem: 26157 Train: [15] [ 500/6250] eta: 0:27:07 lr: 0.000122 grad: 0.0735 (0.0768) loss: 0.8289 (0.8310) time: 0.2689 data: 0.0002 max mem: 26157 Train: [15] [ 600/6250] eta: 0:26:31 lr: 0.000122 grad: 0.0671 (0.0758) loss: 0.8306 (0.8310) time: 0.2703 data: 0.0002 max mem: 26157 Train: [15] [ 700/6250] eta: 0:26:07 lr: 0.000122 grad: 0.0647 (0.0751) loss: 0.8318 (0.8311) time: 0.2719 data: 0.0006 max mem: 26157 Train: [15] [ 800/6250] eta: 0:25:41 lr: 0.000122 grad: 0.0657 (0.0741) loss: 0.8328 (0.8315) time: 0.2717 data: 0.0003 max mem: 26157 Train: [15] [ 900/6250] eta: 0:25:05 lr: 0.000122 grad: 0.0653 (0.0735) loss: 0.8354 (0.8320) time: 0.2709 data: 0.0002 max mem: 26157 Train: [15] [1000/6250] eta: 0:24:45 lr: 0.000122 grad: 0.0742 (0.0731) loss: 0.8308 (0.8322) time: 0.2712 data: 0.0002 max mem: 26157 Train: [15] [1100/6250] eta: 0:24:11 lr: 0.000121 grad: 0.0698 (0.0728) loss: 0.8355 (0.8324) time: 0.2689 data: 0.0002 max mem: 26157 Train: [15] [1200/6250] eta: 0:23:45 lr: 0.000121 grad: 0.0683 (0.0726) loss: 0.8326 (0.8325) time: 0.2730 data: 0.0002 max mem: 26157 Train: [15] [1300/6250] eta: 0:23:12 lr: 0.000121 grad: 0.0675 (0.0725) loss: 0.8338 (0.8325) time: 0.2691 data: 0.0002 max mem: 26157 Train: [15] [1400/6250] eta: 0:22:39 lr: 0.000121 grad: 0.0662 (0.0724) loss: 0.8355 (0.8325) time: 0.2694 data: 0.0002 max mem: 26157 Train: [15] [1500/6250] eta: 0:22:08 lr: 0.000121 grad: 0.0687 (0.0723) loss: 0.8338 (0.8325) time: 0.2694 data: 0.0002 max mem: 26157 Train: [15] [1600/6250] eta: 0:21:37 lr: 0.000121 grad: 0.0656 (0.0724) loss: 0.8359 (0.8325) time: 0.2696 data: 0.0002 max mem: 26157 Train: [15] [1700/6250] eta: 0:21:06 lr: 0.000121 grad: 0.0653 (0.0722) loss: 0.8361 (0.8326) time: 0.2692 data: 0.0002 max mem: 26157 Train: [15] [1800/6250] eta: 0:20:36 lr: 0.000121 grad: 0.0716 (0.0723) loss: 0.8293 (0.8325) time: 0.2683 data: 0.0002 max mem: 26157 Train: [15] [1900/6250] eta: 0:20:06 lr: 0.000121 grad: 0.0689 (0.0723) loss: 0.8337 (0.8324) time: 0.2683 data: 0.0002 max mem: 26157 Train: [15] [2000/6250] eta: 0:19:36 lr: 0.000121 grad: 0.0697 (0.0723) loss: 0.8328 (0.8324) time: 0.2680 data: 0.0002 max mem: 26157 Train: [15] [2100/6250] eta: 0:19:07 lr: 0.000121 grad: 0.0740 (0.0724) loss: 0.8320 (0.8324) time: 0.2693 data: 0.0002 max mem: 26157 Train: [15] [2200/6250] eta: 0:18:38 lr: 0.000121 grad: 0.0681 (0.0724) loss: 0.8273 (0.8323) time: 0.2719 data: 0.0002 max mem: 26157 Train: [15] [2300/6250] eta: 0:18:09 lr: 0.000121 grad: 0.0707 (0.0723) loss: 0.8337 (0.8323) time: 0.2704 data: 0.0002 max mem: 26157 Train: [15] [2400/6250] eta: 0:17:41 lr: 0.000121 grad: 0.0669 (0.0723) loss: 0.8346 (0.8323) time: 0.2690 data: 0.0002 max mem: 26157 Train: [15] [2500/6250] eta: 0:17:12 lr: 0.000121 grad: 0.0708 (0.0725) loss: 0.8350 (0.8323) time: 0.2675 data: 0.0002 max mem: 26157 Train: [15] [2600/6250] eta: 0:16:44 lr: 0.000121 grad: 0.0722 (0.0726) loss: 0.8343 (0.8322) time: 0.2691 data: 0.0002 max mem: 26157 Train: [15] [2700/6250] eta: 0:16:16 lr: 0.000121 grad: 0.0691 (0.0726) loss: 0.8264 (0.8322) time: 0.2711 data: 0.0002 max mem: 26157 Train: [15] [2800/6250] eta: 0:15:48 lr: 0.000121 grad: 0.0685 (0.0727) loss: 0.8299 (0.8321) time: 0.2690 data: 0.0002 max mem: 26157 Train: [15] [2900/6250] eta: 0:15:20 lr: 0.000121 grad: 0.0702 (0.0727) loss: 0.8281 (0.8320) time: 0.2705 data: 0.0002 max mem: 26157 Train: [15] [3000/6250] eta: 0:14:52 lr: 0.000121 grad: 0.0677 (0.0728) loss: 0.8287 (0.8319) time: 0.2704 data: 0.0002 max mem: 26157 Train: [15] [3100/6250] eta: 0:14:24 lr: 0.000121 grad: 0.0680 (0.0728) loss: 0.8298 (0.8318) time: 0.2699 data: 0.0002 max mem: 26157 Train: [15] [3200/6250] eta: 0:13:56 lr: 0.000121 grad: 0.0747 (0.0728) loss: 0.8292 (0.8318) time: 0.2696 data: 0.0002 max mem: 26157 Train: [15] [3300/6250] eta: 0:13:28 lr: 0.000121 grad: 0.0712 (0.0728) loss: 0.8360 (0.8318) time: 0.2680 data: 0.0002 max mem: 26157 Train: [15] [3400/6250] eta: 0:13:01 lr: 0.000121 grad: 0.0673 (0.0728) loss: 0.8354 (0.8319) time: 0.2695 data: 0.0002 max mem: 26157 Train: [15] [3500/6250] eta: 0:12:33 lr: 0.000121 grad: 0.0661 (0.0728) loss: 0.8339 (0.8319) time: 0.2703 data: 0.0002 max mem: 26157 Train: [15] [3600/6250] eta: 0:12:05 lr: 0.000121 grad: 0.0702 (0.0727) loss: 0.8306 (0.8318) time: 0.2681 data: 0.0002 max mem: 26157 Train: [15] [3700/6250] eta: 0:11:37 lr: 0.000121 grad: 0.0675 (0.0727) loss: 0.8296 (0.8318) time: 0.2701 data: 0.0001 max mem: 26157 Train: [15] [3800/6250] eta: 0:11:10 lr: 0.000121 grad: 0.0644 (0.0726) loss: 0.8379 (0.8319) time: 0.2684 data: 0.0002 max mem: 26157 Train: [15] [3900/6250] eta: 0:10:42 lr: 0.000121 grad: 0.0704 (0.0726) loss: 0.8298 (0.8319) time: 0.2690 data: 0.0002 max mem: 26157 Train: [15] [4000/6250] eta: 0:10:15 lr: 0.000121 grad: 0.0710 (0.0726) loss: 0.8339 (0.8318) time: 0.2684 data: 0.0002 max mem: 26157 Train: [15] [4100/6250] eta: 0:09:47 lr: 0.000121 grad: 0.0711 (0.0726) loss: 0.8362 (0.8319) time: 0.2693 data: 0.0002 max mem: 26157 Train: [15] [4200/6250] eta: 0:09:19 lr: 0.000121 grad: 0.0715 (0.0726) loss: 0.8341 (0.8319) time: 0.2694 data: 0.0002 max mem: 26157 Train: [15] [4300/6250] eta: 0:08:52 lr: 0.000121 grad: 0.0686 (0.0726) loss: 0.8323 (0.8319) time: 0.2677 data: 0.0002 max mem: 26157 Train: [15] [4400/6250] eta: 0:08:24 lr: 0.000121 grad: 0.0709 (0.0726) loss: 0.8258 (0.8318) time: 0.2685 data: 0.0002 max mem: 26157 Train: [15] [4500/6250] eta: 0:07:57 lr: 0.000121 grad: 0.0720 (0.0726) loss: 0.8301 (0.8318) time: 0.2717 data: 0.0002 max mem: 26157 Train: [15] [4600/6250] eta: 0:07:30 lr: 0.000121 grad: 0.0686 (0.0726) loss: 0.8306 (0.8317) time: 0.2698 data: 0.0002 max mem: 26157 Train: [15] [4700/6250] eta: 0:07:03 lr: 0.000121 grad: 0.0750 (0.0726) loss: 0.8285 (0.8317) time: 0.2691 data: 0.0002 max mem: 26157 Train: [15] [4800/6250] eta: 0:06:35 lr: 0.000121 grad: 0.0734 (0.0726) loss: 0.8309 (0.8316) time: 0.2703 data: 0.0002 max mem: 26157 Train: [15] [4900/6250] eta: 0:06:08 lr: 0.000121 grad: 0.0688 (0.0726) loss: 0.8355 (0.8316) time: 0.2688 data: 0.0002 max mem: 26157 Train: [15] [5000/6250] eta: 0:05:40 lr: 0.000121 grad: 0.0679 (0.0726) loss: 0.8344 (0.8316) time: 0.2680 data: 0.0002 max mem: 26157 Train: [15] [5100/6250] eta: 0:05:13 lr: 0.000121 grad: 0.0671 (0.0726) loss: 0.8355 (0.8316) time: 0.2716 data: 0.0002 max mem: 26157 Train: [15] [5200/6250] eta: 0:04:46 lr: 0.000121 grad: 0.0715 (0.0726) loss: 0.8249 (0.8315) time: 0.2760 data: 0.0002 max mem: 26157 Train: [15] [5300/6250] eta: 0:04:19 lr: 0.000121 grad: 0.0720 (0.0726) loss: 0.8329 (0.8316) time: 0.2692 data: 0.0002 max mem: 26157 Train: [15] [5400/6250] eta: 0:03:52 lr: 0.000121 grad: 0.0701 (0.0726) loss: 0.8348 (0.8316) time: 0.2700 data: 0.0002 max mem: 26157 Train: [15] [5500/6250] eta: 0:03:25 lr: 0.000121 grad: 0.0669 (0.0725) loss: 0.8370 (0.8316) time: 0.2685 data: 0.0002 max mem: 26157 Train: [15] [5600/6250] eta: 0:02:57 lr: 0.000121 grad: 0.0730 (0.0725) loss: 0.8320 (0.8316) time: 0.2678 data: 0.0002 max mem: 26157 Train: [15] [5700/6250] eta: 0:02:30 lr: 0.000121 grad: 0.0733 (0.0725) loss: 0.8311 (0.8316) time: 0.2709 data: 0.0002 max mem: 26157 Train: [15] [5800/6250] eta: 0:02:02 lr: 0.000121 grad: 0.0671 (0.0725) loss: 0.8324 (0.8316) time: 0.2695 data: 0.0002 max mem: 26157 Train: [15] [5900/6250] eta: 0:01:35 lr: 0.000121 grad: 0.0733 (0.0725) loss: 0.8339 (0.8316) time: 0.2676 data: 0.0002 max mem: 26157 Train: [15] [6000/6250] eta: 0:01:08 lr: 0.000121 grad: 0.0672 (0.0725) loss: 0.8318 (0.8316) time: 0.2682 data: 0.0002 max mem: 26157 Train: [15] [6100/6250] eta: 0:00:40 lr: 0.000121 grad: 0.0712 (0.0726) loss: 0.8279 (0.8316) time: 0.2694 data: 0.0002 max mem: 26157 Train: [15] [6200/6250] eta: 0:00:13 lr: 0.000121 grad: 0.0669 (0.0728) loss: 0.8317 (0.8316) time: 0.2697 data: 0.0002 max mem: 26157 Train: [15] [6249/6250] eta: 0:00:00 lr: 0.000121 grad: 0.0730 (0.0728) loss: 0.8327 (0.8317) time: 0.2696 data: 0.0002 max mem: 26157 Train: [15] Total time: 0:28:37 (0.2749 s / it) Averaged stats: lr: 0.000121 grad: 0.0730 (0.0728) loss: 0.8327 (0.8317) Eval (hcp-train-subset): [15] [ 0/62] eta: 0:03:18 loss: 0.8497 (0.8497) time: 3.1989 data: 3.0566 max mem: 26157 Eval (hcp-train-subset): [15] [61/62] eta: 0:00:00 loss: 0.8382 (0.8394) time: 0.1307 data: 0.0464 max mem: 26157 Eval (hcp-train-subset): [15] Total time: 0:00:12 (0.2082 s / it) Averaged stats (hcp-train-subset): loss: 0.8382 (0.8394) Making plots (hcp-train-subset): example=56 Eval (hcp-val): [15] [ 0/62] eta: 0:06:02 loss: 0.8309 (0.8309) time: 5.8393 data: 5.7555 max mem: 26157 Eval (hcp-val): [15] [61/62] eta: 0:00:00 loss: 0.8349 (0.8355) time: 0.1184 data: 0.0357 max mem: 26157 Eval (hcp-val): [15] Total time: 0:00:12 (0.2059 s / it) Averaged stats (hcp-val): loss: 0.8349 (0.8355) Making plots (hcp-val): example=42 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [16] [ 0/6250] eta: 10:29:25 lr: 0.000121 grad: 0.0766 (0.0766) loss: 0.8514 (0.8514) time: 6.0425 data: 5.7599 max mem: 26157 Train: [16] [ 100/6250] eta: 0:33:40 lr: 0.000121 grad: 0.0770 (0.0819) loss: 0.8296 (0.8347) time: 0.2710 data: 0.0002 max mem: 26157 Train: [16] [ 200/6250] eta: 0:30:12 lr: 0.000121 grad: 0.0727 (0.0784) loss: 0.8290 (0.8324) time: 0.2704 data: 0.0002 max mem: 26157 Train: [16] [ 300/6250] eta: 0:28:45 lr: 0.000121 grad: 0.0699 (0.0758) loss: 0.8362 (0.8326) time: 0.2732 data: 0.0002 max mem: 26157 Train: [16] [ 400/6250] eta: 0:27:50 lr: 0.000121 grad: 0.0657 (0.0744) loss: 0.8380 (0.8337) time: 0.2730 data: 0.0002 max mem: 26157 Train: [16] [ 500/6250] eta: 0:27:06 lr: 0.000121 grad: 0.0686 (0.0737) loss: 0.8383 (0.8345) time: 0.2752 data: 0.0002 max mem: 26157 Train: [16] [ 600/6250] eta: 0:26:27 lr: 0.000121 grad: 0.0670 (0.0728) loss: 0.8307 (0.8348) time: 0.2708 data: 0.0002 max mem: 26157 Train: [16] [ 700/6250] eta: 0:25:51 lr: 0.000121 grad: 0.0736 (0.0727) loss: 0.8284 (0.8347) time: 0.2709 data: 0.0002 max mem: 26157 Train: [16] [ 800/6250] eta: 0:25:18 lr: 0.000121 grad: 0.0679 (0.0723) loss: 0.8370 (0.8349) time: 0.2682 data: 0.0001 max mem: 26157 Train: [16] [ 900/6250] eta: 0:24:45 lr: 0.000121 grad: 0.0697 (0.0722) loss: 0.8359 (0.8347) time: 0.2702 data: 0.0002 max mem: 26157 Train: [16] [1000/6250] eta: 0:24:13 lr: 0.000121 grad: 0.0680 (0.0718) loss: 0.8297 (0.8347) time: 0.2678 data: 0.0001 max mem: 26157 Train: [16] [1100/6250] eta: 0:23:42 lr: 0.000121 grad: 0.0688 (0.0717) loss: 0.8326 (0.8344) time: 0.2685 data: 0.0003 max mem: 26157 Train: [16] [1200/6250] eta: 0:23:11 lr: 0.000121 grad: 0.0658 (0.0718) loss: 0.8308 (0.8341) time: 0.2679 data: 0.0001 max mem: 26157 Train: [16] [1300/6250] eta: 0:22:41 lr: 0.000121 grad: 0.0705 (0.0717) loss: 0.8300 (0.8340) time: 0.2678 data: 0.0002 max mem: 26157 Train: [16] [1400/6250] eta: 0:22:12 lr: 0.000121 grad: 0.0717 (0.0721) loss: 0.8290 (0.8336) time: 0.2683 data: 0.0002 max mem: 26157 Train: [16] [1500/6250] eta: 0:21:42 lr: 0.000121 grad: 0.0737 (0.0721) loss: 0.8307 (0.8334) time: 0.2681 data: 0.0003 max mem: 26157 Train: [16] [1600/6250] eta: 0:21:14 lr: 0.000121 grad: 0.0713 (0.0721) loss: 0.8304 (0.8331) time: 0.2729 data: 0.0002 max mem: 26157 Train: [16] [1700/6250] eta: 0:20:50 lr: 0.000121 grad: 0.0664 (0.0721) loss: 0.8331 (0.8329) time: 0.2941 data: 0.0209 max mem: 26157 Train: [16] [1800/6250] eta: 0:20:32 lr: 0.000121 grad: 0.0704 (0.0721) loss: 0.8286 (0.8327) time: 0.2683 data: 0.0002 max mem: 26157 Train: [16] [1900/6250] eta: 0:20:03 lr: 0.000121 grad: 0.0714 (0.0721) loss: 0.8291 (0.8325) time: 0.2703 data: 0.0002 max mem: 26157 Train: [16] [2000/6250] eta: 0:19:34 lr: 0.000121 grad: 0.0681 (0.0721) loss: 0.8341 (0.8324) time: 0.2696 data: 0.0002 max mem: 26157 Train: [16] [2100/6250] eta: 0:19:05 lr: 0.000121 grad: 0.0673 (0.0721) loss: 0.8270 (0.8323) time: 0.2704 data: 0.0002 max mem: 26157 Train: [16] [2200/6250] eta: 0:18:36 lr: 0.000121 grad: 0.0731 (0.0721) loss: 0.8288 (0.8322) time: 0.2693 data: 0.0003 max mem: 26157 Train: [16] [2300/6250] eta: 0:18:07 lr: 0.000121 grad: 0.0679 (0.0721) loss: 0.8302 (0.8321) time: 0.2711 data: 0.0002 max mem: 26157 Train: [16] [2400/6250] eta: 0:17:45 lr: 0.000121 grad: 0.0698 (0.0721) loss: 0.8306 (0.8321) time: 0.4632 data: 0.1944 max mem: 26157 Train: [16] [2500/6250] eta: 0:17:16 lr: 0.000121 grad: 0.0745 (0.0722) loss: 0.8263 (0.8319) time: 0.2701 data: 0.0002 max mem: 26157 Train: [16] [2600/6250] eta: 0:16:48 lr: 0.000121 grad: 0.0751 (0.0723) loss: 0.8297 (0.8318) time: 0.2686 data: 0.0002 max mem: 26157 Train: [16] [2700/6250] eta: 0:16:19 lr: 0.000121 grad: 0.0702 (0.0723) loss: 0.8309 (0.8317) time: 0.2685 data: 0.0002 max mem: 26157 Train: [16] [2800/6250] eta: 0:15:51 lr: 0.000121 grad: 0.0720 (0.0724) loss: 0.8199 (0.8315) time: 0.2709 data: 0.0002 max mem: 26157 Train: [16] [2900/6250] eta: 0:15:23 lr: 0.000121 grad: 0.0708 (0.0725) loss: 0.8258 (0.8313) time: 0.2687 data: 0.0002 max mem: 26157 Train: [16] [3000/6250] eta: 0:14:54 lr: 0.000121 grad: 0.0737 (0.0724) loss: 0.8265 (0.8311) time: 0.2681 data: 0.0002 max mem: 26157 Train: [16] [3100/6250] eta: 0:14:26 lr: 0.000121 grad: 0.0735 (0.0725) loss: 0.8242 (0.8310) time: 0.2711 data: 0.0002 max mem: 26157 Train: [16] [3200/6250] eta: 0:13:58 lr: 0.000121 grad: 0.0698 (0.0725) loss: 0.8269 (0.8309) time: 0.2698 data: 0.0002 max mem: 26157 Train: [16] [3300/6250] eta: 0:13:30 lr: 0.000121 grad: 0.0797 (0.0727) loss: 0.8182 (0.8308) time: 0.2695 data: 0.0002 max mem: 26157 Train: [16] [3400/6250] eta: 0:13:02 lr: 0.000121 grad: 0.0722 (0.0727) loss: 0.8215 (0.8306) time: 0.2684 data: 0.0002 max mem: 26157 Train: [16] [3500/6250] eta: 0:12:34 lr: 0.000120 grad: 0.0710 (0.0728) loss: 0.8277 (0.8305) time: 0.2720 data: 0.0002 max mem: 26157 Train: [16] [3600/6250] eta: 0:12:07 lr: 0.000120 grad: 0.0692 (0.0728) loss: 0.8325 (0.8305) time: 0.2702 data: 0.0002 max mem: 26157 Train: [16] [3700/6250] eta: 0:11:39 lr: 0.000120 grad: 0.0689 (0.0728) loss: 0.8262 (0.8304) time: 0.2713 data: 0.0002 max mem: 26157 Train: [16] [3800/6250] eta: 0:11:11 lr: 0.000120 grad: 0.0769 (0.0729) loss: 0.8264 (0.8303) time: 0.2692 data: 0.0002 max mem: 26157 Train: [16] [3900/6250] eta: 0:10:44 lr: 0.000120 grad: 0.0687 (0.0729) loss: 0.8277 (0.8303) time: 0.2727 data: 0.0002 max mem: 26157 Train: [16] [4000/6250] eta: 0:10:16 lr: 0.000120 grad: 0.0665 (0.0730) loss: 0.8287 (0.8302) time: 0.2698 data: 0.0002 max mem: 26157 Train: [16] [4100/6250] eta: 0:09:48 lr: 0.000120 grad: 0.0741 (0.0732) loss: 0.8214 (0.8301) time: 0.2717 data: 0.0002 max mem: 26157 Train: [16] [4200/6250] eta: 0:09:21 lr: 0.000120 grad: 0.0756 (0.0732) loss: 0.8249 (0.8300) time: 0.2693 data: 0.0002 max mem: 26157 Train: [16] [4300/6250] eta: 0:08:53 lr: 0.000120 grad: 0.0739 (0.0732) loss: 0.8251 (0.8300) time: 0.2722 data: 0.0002 max mem: 26157 Train: [16] [4400/6250] eta: 0:08:26 lr: 0.000120 grad: 0.0726 (0.0733) loss: 0.8247 (0.8299) time: 0.2745 data: 0.0002 max mem: 26157 Train: [16] [4500/6250] eta: 0:07:59 lr: 0.000120 grad: 0.0758 (0.0733) loss: 0.8286 (0.8298) time: 0.2721 data: 0.0002 max mem: 26157 Train: [16] [4600/6250] eta: 0:07:31 lr: 0.000120 grad: 0.0724 (0.0733) loss: 0.8284 (0.8298) time: 0.2725 data: 0.0002 max mem: 26157 Train: [16] [4700/6250] eta: 0:07:04 lr: 0.000120 grad: 0.0754 (0.0733) loss: 0.8310 (0.8298) time: 0.2727 data: 0.0002 max mem: 26157 Train: [16] [4800/6250] eta: 0:06:36 lr: 0.000120 grad: 0.0710 (0.0734) loss: 0.8285 (0.8297) time: 0.2719 data: 0.0002 max mem: 26157 Train: [16] [4900/6250] eta: 0:06:10 lr: 0.000120 grad: 0.0682 (0.0733) loss: 0.8263 (0.8297) time: 0.2696 data: 0.0002 max mem: 26157 Train: [16] [5000/6250] eta: 0:05:43 lr: 0.000120 grad: 0.0702 (0.0733) loss: 0.8266 (0.8297) time: 0.2680 data: 0.0001 max mem: 26157 Train: [16] [5100/6250] eta: 0:05:15 lr: 0.000120 grad: 0.0715 (0.0733) loss: 0.8314 (0.8297) time: 0.2696 data: 0.0003 max mem: 26157 Train: [16] [5200/6250] eta: 0:04:47 lr: 0.000120 grad: 0.0682 (0.0733) loss: 0.8265 (0.8297) time: 0.2678 data: 0.0001 max mem: 26157 Train: [16] [5300/6250] eta: 0:04:20 lr: 0.000120 grad: 0.0722 (0.0734) loss: 0.8316 (0.8297) time: 0.2677 data: 0.0002 max mem: 26157 Train: [16] [5400/6250] eta: 0:03:52 lr: 0.000120 grad: 0.0708 (0.0733) loss: 0.8301 (0.8297) time: 0.2684 data: 0.0002 max mem: 26157 Train: [16] [5500/6250] eta: 0:03:25 lr: 0.000120 grad: 0.0693 (0.0733) loss: 0.8308 (0.8298) time: 0.2706 data: 0.0002 max mem: 26157 Train: [16] [5600/6250] eta: 0:02:57 lr: 0.000120 grad: 0.0721 (0.0733) loss: 0.8324 (0.8298) time: 0.2683 data: 0.0002 max mem: 26157 Train: [16] [5700/6250] eta: 0:02:30 lr: 0.000120 grad: 0.0758 (0.0733) loss: 0.8274 (0.8298) time: 0.2686 data: 0.0002 max mem: 26157 Train: [16] [5800/6250] eta: 0:02:03 lr: 0.000120 grad: 0.0760 (0.0733) loss: 0.8248 (0.8298) time: 0.2686 data: 0.0002 max mem: 26157 Train: [16] [5900/6250] eta: 0:01:35 lr: 0.000120 grad: 0.0712 (0.0733) loss: 0.8291 (0.8298) time: 0.2685 data: 0.0002 max mem: 26157 Train: [16] [6000/6250] eta: 0:01:08 lr: 0.000120 grad: 0.0763 (0.0734) loss: 0.8234 (0.8297) time: 0.2690 data: 0.0002 max mem: 26157 Train: [16] [6100/6250] eta: 0:00:41 lr: 0.000120 grad: 0.0763 (0.0734) loss: 0.8275 (0.8297) time: 0.2684 data: 0.0002 max mem: 26157 Train: [16] [6200/6250] eta: 0:00:13 lr: 0.000120 grad: 0.0684 (0.0734) loss: 0.8336 (0.8297) time: 0.3009 data: 0.0299 max mem: 26157 Train: [16] [6249/6250] eta: 0:00:00 lr: 0.000120 grad: 0.0773 (0.0735) loss: 0.8275 (0.8297) time: 0.2785 data: 0.0002 max mem: 26157 Train: [16] Total time: 0:28:36 (0.2746 s / it) Averaged stats: lr: 0.000120 grad: 0.0773 (0.0735) loss: 0.8275 (0.8297) Eval (hcp-train-subset): [16] [ 0/62] eta: 0:05:25 loss: 0.8519 (0.8519) time: 5.2443 data: 5.1603 max mem: 26157 Eval (hcp-train-subset): [16] [61/62] eta: 0:00:00 loss: 0.8376 (0.8399) time: 0.1137 data: 0.0288 max mem: 26157 Eval (hcp-train-subset): [16] Total time: 0:00:12 (0.2084 s / it) Averaged stats (hcp-train-subset): loss: 0.8376 (0.8399) Making plots (hcp-train-subset): example=57 Eval (hcp-val): [16] [ 0/62] eta: 0:05:03 loss: 0.8335 (0.8335) time: 4.8919 data: 4.8075 max mem: 26157 Eval (hcp-val): [16] [61/62] eta: 0:00:00 loss: 0.8340 (0.8358) time: 0.1336 data: 0.0506 max mem: 26157 Eval (hcp-val): [16] Total time: 0:00:12 (0.2081 s / it) Averaged stats (hcp-val): loss: 0.8340 (0.8358) Making plots (hcp-val): example=27 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [17] [ 0/6250] eta: 10:50:37 lr: 0.000120 grad: 0.0792 (0.0792) loss: 0.8202 (0.8202) time: 6.2460 data: 5.9654 max mem: 26157 Train: [17] [ 100/6250] eta: 0:34:01 lr: 0.000120 grad: 0.0794 (0.0849) loss: 0.8271 (0.8303) time: 0.2700 data: 0.0002 max mem: 26157 Train: [17] [ 200/6250] eta: 0:30:27 lr: 0.000120 grad: 0.0769 (0.0827) loss: 0.8208 (0.8254) time: 0.2719 data: 0.0002 max mem: 26157 Train: [17] [ 300/6250] eta: 0:28:58 lr: 0.000120 grad: 0.0723 (0.0802) loss: 0.8317 (0.8254) time: 0.2732 data: 0.0002 max mem: 26157 Train: [17] [ 400/6250] eta: 0:28:00 lr: 0.000120 grad: 0.0707 (0.0789) loss: 0.8291 (0.8259) time: 0.2713 data: 0.0003 max mem: 26157 Train: [17] [ 500/6250] eta: 0:27:15 lr: 0.000120 grad: 0.0712 (0.0787) loss: 0.8292 (0.8264) time: 0.2721 data: 0.0003 max mem: 26157 Train: [17] [ 600/6250] eta: 0:26:34 lr: 0.000120 grad: 0.0691 (0.0777) loss: 0.8286 (0.8269) time: 0.2701 data: 0.0002 max mem: 26157 Train: [17] [ 700/6250] eta: 0:25:56 lr: 0.000120 grad: 0.0651 (0.0770) loss: 0.8336 (0.8274) time: 0.2702 data: 0.0002 max mem: 26157 Train: [17] [ 800/6250] eta: 0:25:23 lr: 0.000120 grad: 0.0684 (0.0763) loss: 0.8271 (0.8278) time: 0.2696 data: 0.0002 max mem: 26157 Train: [17] [ 900/6250] eta: 0:24:50 lr: 0.000120 grad: 0.0688 (0.0759) loss: 0.8291 (0.8279) time: 0.2744 data: 0.0002 max mem: 26157 Train: [17] [1000/6250] eta: 0:24:19 lr: 0.000120 grad: 0.0713 (0.0755) loss: 0.8315 (0.8279) time: 0.2728 data: 0.0002 max mem: 26157 Train: [17] [1100/6250] eta: 0:23:48 lr: 0.000120 grad: 0.0648 (0.0752) loss: 0.8320 (0.8279) time: 0.2693 data: 0.0002 max mem: 26157 Train: [17] [1200/6250] eta: 0:23:17 lr: 0.000120 grad: 0.0664 (0.0750) loss: 0.8307 (0.8278) time: 0.2705 data: 0.0002 max mem: 26157 Train: [17] [1300/6250] eta: 0:22:47 lr: 0.000120 grad: 0.0668 (0.0747) loss: 0.8336 (0.8279) time: 0.2694 data: 0.0002 max mem: 26157 Train: [17] [1400/6250] eta: 0:22:17 lr: 0.000120 grad: 0.0701 (0.0745) loss: 0.8286 (0.8279) time: 0.2694 data: 0.0002 max mem: 26157 Train: [17] [1500/6250] eta: 0:22:03 lr: 0.000120 grad: 0.0722 (0.0743) loss: 0.8255 (0.8279) time: 0.2685 data: 0.0002 max mem: 26157 Train: [17] [1600/6250] eta: 0:21:33 lr: 0.000120 grad: 0.0685 (0.0742) loss: 0.8213 (0.8279) time: 0.2686 data: 0.0002 max mem: 26157 Train: [17] [1700/6250] eta: 0:21:02 lr: 0.000120 grad: 0.0744 (0.0741) loss: 0.8286 (0.8280) time: 0.2683 data: 0.0002 max mem: 26157 Train: [17] [1800/6250] eta: 0:20:33 lr: 0.000120 grad: 0.0759 (0.0740) loss: 0.8227 (0.8279) time: 0.2705 data: 0.0002 max mem: 26157 Train: [17] [1900/6250] eta: 0:20:03 lr: 0.000120 grad: 0.0661 (0.0739) loss: 0.8342 (0.8280) time: 0.2716 data: 0.0002 max mem: 26157 Train: [17] [2000/6250] eta: 0:19:34 lr: 0.000120 grad: 0.0725 (0.0738) loss: 0.8272 (0.8281) time: 0.2690 data: 0.0002 max mem: 26157 Train: [17] [2100/6250] eta: 0:19:06 lr: 0.000120 grad: 0.0733 (0.0738) loss: 0.8307 (0.8282) time: 0.2685 data: 0.0002 max mem: 26157 Train: [17] [2200/6250] eta: 0:18:37 lr: 0.000120 grad: 0.0716 (0.0737) loss: 0.8280 (0.8282) time: 0.2685 data: 0.0002 max mem: 26157 Train: [17] [2300/6250] eta: 0:18:08 lr: 0.000120 grad: 0.0656 (0.0735) loss: 0.8364 (0.8284) time: 0.2708 data: 0.0002 max mem: 26157 Train: [17] [2400/6250] eta: 0:17:40 lr: 0.000120 grad: 0.0694 (0.0734) loss: 0.8321 (0.8286) time: 0.2691 data: 0.0002 max mem: 26157 Train: [17] [2500/6250] eta: 0:17:11 lr: 0.000120 grad: 0.0697 (0.0733) loss: 0.8335 (0.8287) time: 0.2695 data: 0.0002 max mem: 26157 Train: [17] [2600/6250] eta: 0:16:43 lr: 0.000120 grad: 0.0750 (0.0732) loss: 0.8286 (0.8288) time: 0.2691 data: 0.0002 max mem: 26157 Train: [17] [2700/6250] eta: 0:16:15 lr: 0.000120 grad: 0.0653 (0.0732) loss: 0.8316 (0.8289) time: 0.2684 data: 0.0002 max mem: 26157 Train: [17] [2800/6250] eta: 0:15:46 lr: 0.000120 grad: 0.0670 (0.0730) loss: 0.8321 (0.8290) time: 0.2689 data: 0.0002 max mem: 26157 Train: [17] [2900/6250] eta: 0:15:18 lr: 0.000120 grad: 0.0683 (0.0728) loss: 0.8287 (0.8291) time: 0.2675 data: 0.0002 max mem: 26157 Train: [17] [3000/6250] eta: 0:14:50 lr: 0.000120 grad: 0.0694 (0.0728) loss: 0.8249 (0.8291) time: 0.2691 data: 0.0001 max mem: 26157 Train: [17] [3100/6250] eta: 0:14:23 lr: 0.000120 grad: 0.0703 (0.0727) loss: 0.8350 (0.8292) time: 0.2701 data: 0.0002 max mem: 26157 Train: [17] [3200/6250] eta: 0:14:02 lr: 0.000120 grad: 0.0669 (0.0727) loss: 0.8346 (0.8293) time: 0.2745 data: 0.0002 max mem: 26157 Train: [17] [3300/6250] eta: 0:13:34 lr: 0.000120 grad: 0.0717 (0.0726) loss: 0.8320 (0.8293) time: 0.2726 data: 0.0002 max mem: 26157 Train: [17] [3400/6250] eta: 0:13:12 lr: 0.000120 grad: 0.0697 (0.0727) loss: 0.8290 (0.8293) time: 0.2707 data: 0.0003 max mem: 26157 Train: [17] [3500/6250] eta: 0:12:43 lr: 0.000120 grad: 0.0676 (0.0726) loss: 0.8358 (0.8294) time: 0.2677 data: 0.0002 max mem: 26157 Train: [17] [3600/6250] eta: 0:12:15 lr: 0.000120 grad: 0.0723 (0.0727) loss: 0.8321 (0.8295) time: 0.2702 data: 0.0002 max mem: 26157 Train: [17] [3700/6250] eta: 0:11:47 lr: 0.000120 grad: 0.0731 (0.0727) loss: 0.8291 (0.8295) time: 0.2694 data: 0.0002 max mem: 26157 Train: [17] [3800/6250] eta: 0:11:18 lr: 0.000120 grad: 0.0682 (0.0727) loss: 0.8304 (0.8295) time: 0.2685 data: 0.0002 max mem: 26157 Train: [17] [3900/6250] eta: 0:10:50 lr: 0.000120 grad: 0.0671 (0.0726) loss: 0.8288 (0.8295) time: 0.2681 data: 0.0002 max mem: 26157 Train: [17] [4000/6250] eta: 0:10:22 lr: 0.000120 grad: 0.0690 (0.0726) loss: 0.8297 (0.8295) time: 0.2684 data: 0.0002 max mem: 26157 Train: [17] [4100/6250] eta: 0:09:54 lr: 0.000120 grad: 0.0719 (0.0726) loss: 0.8241 (0.8295) time: 0.2690 data: 0.0002 max mem: 26157 Train: [17] [4200/6250] eta: 0:09:26 lr: 0.000120 grad: 0.0714 (0.0727) loss: 0.8263 (0.8295) time: 0.2699 data: 0.0003 max mem: 26157 Train: [17] [4300/6250] eta: 0:08:58 lr: 0.000120 grad: 0.0656 (0.0726) loss: 0.8385 (0.8296) time: 0.2698 data: 0.0002 max mem: 26157 Train: [17] [4400/6250] eta: 0:08:31 lr: 0.000120 grad: 0.0705 (0.0727) loss: 0.8306 (0.8296) time: 0.2691 data: 0.0002 max mem: 26157 Train: [17] [4500/6250] eta: 0:08:03 lr: 0.000120 grad: 0.0733 (0.0727) loss: 0.8277 (0.8296) time: 0.2691 data: 0.0002 max mem: 26157 Train: [17] [4600/6250] eta: 0:07:35 lr: 0.000120 grad: 0.0735 (0.0727) loss: 0.8292 (0.8297) time: 0.2704 data: 0.0002 max mem: 26157 Train: [17] [4700/6250] eta: 0:07:07 lr: 0.000120 grad: 0.0698 (0.0727) loss: 0.8313 (0.8296) time: 0.2689 data: 0.0002 max mem: 26157 Train: [17] [4800/6250] eta: 0:06:39 lr: 0.000120 grad: 0.0706 (0.0728) loss: 0.8297 (0.8297) time: 0.2699 data: 0.0002 max mem: 26157 Train: [17] [4900/6250] eta: 0:06:11 lr: 0.000119 grad: 0.0739 (0.0729) loss: 0.8319 (0.8297) time: 0.2682 data: 0.0001 max mem: 26157 Train: [17] [5000/6250] eta: 0:05:44 lr: 0.000119 grad: 0.0742 (0.0729) loss: 0.8227 (0.8297) time: 0.2693 data: 0.0002 max mem: 26157 Train: [17] [5100/6250] eta: 0:05:16 lr: 0.000119 grad: 0.0756 (0.0730) loss: 0.8262 (0.8297) time: 0.2703 data: 0.0002 max mem: 26157 Train: [17] [5200/6250] eta: 0:04:49 lr: 0.000119 grad: 0.0723 (0.0731) loss: 0.8240 (0.8296) time: 0.3048 data: 0.0348 max mem: 26157 Train: [17] [5300/6250] eta: 0:04:22 lr: 0.000119 grad: 0.0769 (0.0732) loss: 0.8285 (0.8296) time: 0.2689 data: 0.0001 max mem: 26157 Train: [17] [5400/6250] eta: 0:03:54 lr: 0.000119 grad: 0.0797 (0.0733) loss: 0.8227 (0.8296) time: 0.2692 data: 0.0002 max mem: 26157 Train: [17] [5500/6250] eta: 0:03:26 lr: 0.000119 grad: 0.0755 (0.0733) loss: 0.8267 (0.8295) time: 0.2706 data: 0.0002 max mem: 26157 Train: [17] [5600/6250] eta: 0:02:59 lr: 0.000119 grad: 0.0763 (0.0733) loss: 0.8296 (0.8295) time: 0.2707 data: 0.0002 max mem: 26157 Train: [17] [5700/6250] eta: 0:02:31 lr: 0.000119 grad: 0.0759 (0.0734) loss: 0.8253 (0.8294) time: 0.2711 data: 0.0002 max mem: 26157 Train: [17] [5800/6250] eta: 0:02:04 lr: 0.000119 grad: 0.0754 (0.0734) loss: 0.8256 (0.8293) time: 0.2726 data: 0.0003 max mem: 26157 Train: [17] [5900/6250] eta: 0:01:36 lr: 0.000119 grad: 0.0762 (0.0735) loss: 0.8249 (0.8293) time: 0.2693 data: 0.0002 max mem: 26157 Train: [17] [6000/6250] eta: 0:01:08 lr: 0.000119 grad: 0.0754 (0.0735) loss: 0.8263 (0.8292) time: 0.2695 data: 0.0002 max mem: 26157 Train: [17] [6100/6250] eta: 0:00:41 lr: 0.000119 grad: 0.0739 (0.0736) loss: 0.8205 (0.8291) time: 0.2701 data: 0.0002 max mem: 26157 Train: [17] [6200/6250] eta: 0:00:13 lr: 0.000119 grad: 0.0726 (0.0737) loss: 0.8273 (0.8290) time: 0.2716 data: 0.0002 max mem: 26157 Train: [17] [6249/6250] eta: 0:00:00 lr: 0.000119 grad: 0.0726 (0.0737) loss: 0.8253 (0.8290) time: 0.2710 data: 0.0002 max mem: 26157 Train: [17] Total time: 0:28:55 (0.2777 s / it) Averaged stats: lr: 0.000119 grad: 0.0726 (0.0737) loss: 0.8253 (0.8290) Eval (hcp-train-subset): [17] [ 0/62] eta: 0:05:19 loss: 0.8488 (0.8488) time: 5.1472 data: 5.0617 max mem: 26157 Eval (hcp-train-subset): [17] [61/62] eta: 0:00:00 loss: 0.8359 (0.8387) time: 0.1338 data: 0.0510 max mem: 26157 Eval (hcp-train-subset): [17] Total time: 0:00:13 (0.2104 s / it) Averaged stats (hcp-train-subset): loss: 0.8359 (0.8387) Making plots (hcp-train-subset): example=3 Eval (hcp-val): [17] [ 0/62] eta: 0:03:20 loss: 0.8339 (0.8339) time: 3.2295 data: 3.1248 max mem: 26157 Eval (hcp-val): [17] [61/62] eta: 0:00:00 loss: 0.8329 (0.8352) time: 0.0829 data: 0.0002 max mem: 26157 Eval (hcp-val): [17] Total time: 0:00:17 (0.2872 s / it) Averaged stats (hcp-val): loss: 0.8329 (0.8352) Making plots (hcp-val): example=38 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [18] [ 0/6250] eta: 9:22:26 lr: 0.000119 grad: 0.0795 (0.0795) loss: 0.8629 (0.8629) time: 5.3995 data: 5.1095 max mem: 26157 Train: [18] [ 100/6250] eta: 0:34:11 lr: 0.000119 grad: 0.0775 (0.0825) loss: 0.8279 (0.8325) time: 0.2709 data: 0.0002 max mem: 26157 Train: [18] [ 200/6250] eta: 0:30:29 lr: 0.000119 grad: 0.0745 (0.0792) loss: 0.8252 (0.8302) time: 0.2715 data: 0.0002 max mem: 26157 Train: [18] [ 300/6250] eta: 0:28:55 lr: 0.000119 grad: 0.0714 (0.0772) loss: 0.8288 (0.8300) time: 0.2697 data: 0.0002 max mem: 26157 Train: [18] [ 400/6250] eta: 0:27:56 lr: 0.000119 grad: 0.0706 (0.0763) loss: 0.8320 (0.8304) time: 0.2734 data: 0.0002 max mem: 26157 Train: [18] [ 500/6250] eta: 0:27:10 lr: 0.000119 grad: 0.0710 (0.0755) loss: 0.8309 (0.8305) time: 0.2677 data: 0.0001 max mem: 26157 Train: [18] [ 600/6250] eta: 0:26:30 lr: 0.000119 grad: 0.0677 (0.0747) loss: 0.8277 (0.8304) time: 0.2716 data: 0.0002 max mem: 26157 Train: [18] [ 700/6250] eta: 0:25:53 lr: 0.000119 grad: 0.0697 (0.0744) loss: 0.8281 (0.8302) time: 0.2714 data: 0.0002 max mem: 26157 Train: [18] [ 800/6250] eta: 0:25:18 lr: 0.000119 grad: 0.0667 (0.0741) loss: 0.8256 (0.8298) time: 0.2703 data: 0.0002 max mem: 26157 Train: [18] [ 900/6250] eta: 0:24:45 lr: 0.000119 grad: 0.0691 (0.0737) loss: 0.8272 (0.8298) time: 0.2712 data: 0.0003 max mem: 26157 Train: [18] [1000/6250] eta: 0:24:14 lr: 0.000119 grad: 0.0719 (0.0735) loss: 0.8304 (0.8297) time: 0.2688 data: 0.0002 max mem: 26157 Train: [18] [1100/6250] eta: 0:23:43 lr: 0.000119 grad: 0.0680 (0.0734) loss: 0.8339 (0.8295) time: 0.2740 data: 0.0002 max mem: 26157 Train: [18] [1200/6250] eta: 0:23:16 lr: 0.000119 grad: 0.0727 (0.0732) loss: 0.8262 (0.8294) time: 0.2709 data: 0.0003 max mem: 26157 Train: [18] [1300/6250] eta: 0:22:54 lr: 0.000119 grad: 0.0718 (0.0732) loss: 0.8251 (0.8292) time: 0.2696 data: 0.0002 max mem: 26157 Train: [18] [1400/6250] eta: 0:22:24 lr: 0.000119 grad: 0.0727 (0.0731) loss: 0.8232 (0.8290) time: 0.2698 data: 0.0002 max mem: 26157 Train: [18] [1500/6250] eta: 0:21:54 lr: 0.000119 grad: 0.0685 (0.0731) loss: 0.8300 (0.8287) time: 0.2695 data: 0.0002 max mem: 26157 Train: [18] [1600/6250] eta: 0:21:25 lr: 0.000119 grad: 0.0683 (0.0730) loss: 0.8255 (0.8285) time: 0.2688 data: 0.0002 max mem: 26157 Train: [18] [1700/6250] eta: 0:20:56 lr: 0.000119 grad: 0.0735 (0.0729) loss: 0.8262 (0.8284) time: 0.2701 data: 0.0002 max mem: 26157 Train: [18] [1800/6250] eta: 0:20:27 lr: 0.000119 grad: 0.0728 (0.0728) loss: 0.8213 (0.8283) time: 0.2698 data: 0.0002 max mem: 26157 Train: [18] [1900/6250] eta: 0:19:58 lr: 0.000119 grad: 0.0775 (0.0730) loss: 0.8194 (0.8280) time: 0.2748 data: 0.0002 max mem: 26157 Train: [18] [2000/6250] eta: 0:19:39 lr: 0.000119 grad: 0.0756 (0.0731) loss: 0.8223 (0.8278) time: 0.3829 data: 0.1049 max mem: 26157 Train: [18] [2100/6250] eta: 0:19:10 lr: 0.000119 grad: 0.0748 (0.0732) loss: 0.8303 (0.8277) time: 0.2725 data: 0.0002 max mem: 26157 Train: [18] [2200/6250] eta: 0:18:45 lr: 0.000119 grad: 0.0750 (0.0732) loss: 0.8269 (0.8276) time: 0.2716 data: 0.0002 max mem: 26157 Train: [18] [2300/6250] eta: 0:18:16 lr: 0.000119 grad: 0.0747 (0.0733) loss: 0.8300 (0.8275) time: 0.2699 data: 0.0002 max mem: 26157 Train: [18] [2400/6250] eta: 0:17:47 lr: 0.000119 grad: 0.0739 (0.0733) loss: 0.8245 (0.8275) time: 0.2709 data: 0.0002 max mem: 26157 Train: [18] [2500/6250] eta: 0:17:18 lr: 0.000119 grad: 0.0699 (0.0732) loss: 0.8268 (0.8276) time: 0.2700 data: 0.0002 max mem: 26157 Train: [18] [2600/6250] eta: 0:16:50 lr: 0.000119 grad: 0.0660 (0.0731) loss: 0.8326 (0.8276) time: 0.2690 data: 0.0002 max mem: 26157 Train: [18] [2700/6250] eta: 0:16:21 lr: 0.000119 grad: 0.0700 (0.0731) loss: 0.8270 (0.8277) time: 0.2711 data: 0.0002 max mem: 26157 Train: [18] [2800/6250] eta: 0:15:53 lr: 0.000119 grad: 0.0674 (0.0731) loss: 0.8275 (0.8277) time: 0.2713 data: 0.0002 max mem: 26157 Train: [18] [2900/6250] eta: 0:15:25 lr: 0.000119 grad: 0.0675 (0.0730) loss: 0.8329 (0.8278) time: 0.2687 data: 0.0002 max mem: 26157 Train: [18] [3000/6250] eta: 0:14:56 lr: 0.000119 grad: 0.0737 (0.0730) loss: 0.8249 (0.8278) time: 0.2698 data: 0.0002 max mem: 26157 Train: [18] [3100/6250] eta: 0:14:28 lr: 0.000119 grad: 0.0675 (0.0729) loss: 0.8281 (0.8279) time: 0.2716 data: 0.0002 max mem: 26157 Train: [18] [3200/6250] eta: 0:14:00 lr: 0.000119 grad: 0.0702 (0.0728) loss: 0.8252 (0.8279) time: 0.2715 data: 0.0002 max mem: 26157 Train: [18] [3300/6250] eta: 0:13:32 lr: 0.000119 grad: 0.0702 (0.0727) loss: 0.8248 (0.8279) time: 0.2713 data: 0.0002 max mem: 26157 Train: [18] [3400/6250] eta: 0:13:04 lr: 0.000119 grad: 0.0677 (0.0727) loss: 0.8272 (0.8279) time: 0.2706 data: 0.0002 max mem: 26157 Train: [18] [3500/6250] eta: 0:12:36 lr: 0.000119 grad: 0.0743 (0.0728) loss: 0.8268 (0.8278) time: 0.2708 data: 0.0002 max mem: 26157 Train: [18] [3600/6250] eta: 0:12:09 lr: 0.000119 grad: 0.0715 (0.0729) loss: 0.8276 (0.8278) time: 0.2705 data: 0.0001 max mem: 26157 Train: [18] [3700/6250] eta: 0:11:41 lr: 0.000119 grad: 0.0684 (0.0729) loss: 0.8271 (0.8278) time: 0.2698 data: 0.0002 max mem: 26157 Train: [18] [3800/6250] eta: 0:11:14 lr: 0.000119 grad: 0.0787 (0.0731) loss: 0.8266 (0.8276) time: 0.2690 data: 0.0002 max mem: 26157 Train: [18] [3900/6250] eta: 0:10:46 lr: 0.000119 grad: 0.0764 (0.0732) loss: 0.8289 (0.8276) time: 0.2682 data: 0.0002 max mem: 26157 Train: [18] [4000/6250] eta: 0:10:18 lr: 0.000119 grad: 0.0770 (0.0732) loss: 0.8208 (0.8276) time: 0.2709 data: 0.0002 max mem: 26157 Train: [18] [4100/6250] eta: 0:09:50 lr: 0.000119 grad: 0.0733 (0.0733) loss: 0.8258 (0.8276) time: 0.2687 data: 0.0002 max mem: 26157 Train: [18] [4200/6250] eta: 0:09:22 lr: 0.000119 grad: 0.0739 (0.0733) loss: 0.8237 (0.8276) time: 0.2693 data: 0.0002 max mem: 26157 Train: [18] [4300/6250] eta: 0:08:55 lr: 0.000119 grad: 0.0745 (0.0734) loss: 0.8222 (0.8276) time: 0.2672 data: 0.0002 max mem: 26157 Train: [18] [4400/6250] eta: 0:08:27 lr: 0.000119 grad: 0.0724 (0.0734) loss: 0.8243 (0.8276) time: 0.2701 data: 0.0002 max mem: 26157 Train: [18] [4500/6250] eta: 0:07:59 lr: 0.000119 grad: 0.0756 (0.0734) loss: 0.8283 (0.8275) time: 0.2715 data: 0.0002 max mem: 26157 Train: [18] [4600/6250] eta: 0:07:33 lr: 0.000119 grad: 0.0711 (0.0735) loss: 0.8289 (0.8275) time: 0.2765 data: 0.0002 max mem: 26157 Train: [18] [4700/6250] eta: 0:07:05 lr: 0.000119 grad: 0.0745 (0.0735) loss: 0.8239 (0.8275) time: 0.2762 data: 0.0002 max mem: 26157 Train: [18] [4800/6250] eta: 0:06:38 lr: 0.000119 grad: 0.0700 (0.0735) loss: 0.8270 (0.8276) time: 0.2693 data: 0.0001 max mem: 26157 Train: [18] [4900/6250] eta: 0:06:10 lr: 0.000119 grad: 0.0716 (0.0736) loss: 0.8269 (0.8276) time: 0.2688 data: 0.0002 max mem: 26157 Train: [18] [5000/6250] eta: 0:05:43 lr: 0.000119 grad: 0.0686 (0.0736) loss: 0.8314 (0.8276) time: 0.2675 data: 0.0001 max mem: 26157 Train: [18] [5100/6250] eta: 0:05:15 lr: 0.000119 grad: 0.0702 (0.0735) loss: 0.8284 (0.8276) time: 0.2713 data: 0.0002 max mem: 26157 Train: [18] [5200/6250] eta: 0:04:47 lr: 0.000119 grad: 0.0782 (0.0736) loss: 0.8311 (0.8277) time: 0.2681 data: 0.0002 max mem: 26157 Train: [18] [5300/6250] eta: 0:04:20 lr: 0.000119 grad: 0.0740 (0.0736) loss: 0.8274 (0.8277) time: 0.2681 data: 0.0002 max mem: 26157 Train: [18] [5400/6250] eta: 0:03:52 lr: 0.000119 grad: 0.0721 (0.0736) loss: 0.8299 (0.8277) time: 0.2679 data: 0.0002 max mem: 26157 Train: [18] [5500/6250] eta: 0:03:25 lr: 0.000119 grad: 0.0767 (0.0737) loss: 0.8243 (0.8276) time: 0.2692 data: 0.0002 max mem: 26157 Train: [18] [5600/6250] eta: 0:02:58 lr: 0.000119 grad: 0.0714 (0.0737) loss: 0.8281 (0.8276) time: 0.2691 data: 0.0002 max mem: 26157 Train: [18] [5700/6250] eta: 0:02:30 lr: 0.000119 grad: 0.0708 (0.0738) loss: 0.8236 (0.8276) time: 0.2695 data: 0.0002 max mem: 26157 Train: [18] [5800/6250] eta: 0:02:03 lr: 0.000118 grad: 0.0735 (0.0738) loss: 0.8277 (0.8276) time: 0.2699 data: 0.0002 max mem: 26157 Train: [18] [5900/6250] eta: 0:01:35 lr: 0.000118 grad: 0.0747 (0.0738) loss: 0.8276 (0.8275) time: 0.2685 data: 0.0002 max mem: 26157 Train: [18] [6000/6250] eta: 0:01:08 lr: 0.000118 grad: 0.0775 (0.0739) loss: 0.8194 (0.8275) time: 0.2677 data: 0.0002 max mem: 26157 Train: [18] [6100/6250] eta: 0:00:41 lr: 0.000118 grad: 0.0746 (0.0739) loss: 0.8250 (0.8274) time: 0.2696 data: 0.0002 max mem: 26157 Train: [18] [6200/6250] eta: 0:00:13 lr: 0.000118 grad: 0.0739 (0.0739) loss: 0.8265 (0.8274) time: 0.2704 data: 0.0002 max mem: 26157 Train: [18] [6249/6250] eta: 0:00:00 lr: 0.000118 grad: 0.0728 (0.0739) loss: 0.8289 (0.8274) time: 0.2694 data: 0.0001 max mem: 26157 Train: [18] Total time: 0:28:36 (0.2746 s / it) Averaged stats: lr: 0.000118 grad: 0.0728 (0.0739) loss: 0.8289 (0.8274) Eval (hcp-train-subset): [18] [ 0/62] eta: 0:05:58 loss: 0.8508 (0.8508) time: 5.7747 data: 5.6897 max mem: 26157 Eval (hcp-train-subset): [18] [61/62] eta: 0:00:00 loss: 0.8359 (0.8387) time: 0.1437 data: 0.0610 max mem: 26157 Eval (hcp-train-subset): [18] Total time: 0:00:14 (0.2413 s / it) Averaged stats (hcp-train-subset): loss: 0.8359 (0.8387) Making plots (hcp-train-subset): example=2 Eval (hcp-val): [18] [ 0/62] eta: 0:04:56 loss: 0.8324 (0.8324) time: 4.7857 data: 4.6945 max mem: 26157 Eval (hcp-val): [18] [61/62] eta: 0:00:00 loss: 0.8323 (0.8342) time: 0.1583 data: 0.0751 max mem: 26157 Eval (hcp-val): [18] Total time: 0:00:14 (0.2357 s / it) Averaged stats (hcp-val): loss: 0.8323 (0.8342) Making plots (hcp-val): example=25 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [19] [ 0/6250] eta: 12:09:00 lr: 0.000118 grad: 0.1213 (0.1213) loss: 0.8194 (0.8194) time: 6.9985 data: 6.7215 max mem: 26157 Train: [19] [ 100/6250] eta: 0:34:42 lr: 0.000118 grad: 0.0772 (0.0833) loss: 0.8387 (0.8401) time: 0.2723 data: 0.0002 max mem: 26157 Train: [19] [ 200/6250] eta: 0:30:42 lr: 0.000118 grad: 0.0718 (0.0796) loss: 0.8327 (0.8369) time: 0.2699 data: 0.0002 max mem: 26157 Train: [19] [ 300/6250] eta: 0:29:03 lr: 0.000118 grad: 0.0658 (0.0783) loss: 0.8358 (0.8350) time: 0.2694 data: 0.0002 max mem: 26157 Train: [19] [ 400/6250] eta: 0:28:19 lr: 0.000118 grad: 0.0709 (0.0786) loss: 0.8301 (0.8336) time: 0.3333 data: 0.0002 max mem: 26157 Train: [19] [ 500/6250] eta: 0:27:28 lr: 0.000118 grad: 0.0713 (0.0778) loss: 0.8265 (0.8325) time: 0.2722 data: 0.0002 max mem: 26157 Train: [19] [ 600/6250] eta: 0:26:44 lr: 0.000118 grad: 0.0708 (0.0768) loss: 0.8275 (0.8319) time: 0.2700 data: 0.0002 max mem: 26157 Train: [19] [ 700/6250] eta: 0:26:05 lr: 0.000118 grad: 0.0720 (0.0763) loss: 0.8312 (0.8317) time: 0.2698 data: 0.0002 max mem: 26157 Train: [19] [ 800/6250] eta: 0:25:29 lr: 0.000118 grad: 0.0736 (0.0759) loss: 0.8360 (0.8317) time: 0.2768 data: 0.0002 max mem: 26157 Train: [19] [ 900/6250] eta: 0:24:59 lr: 0.000118 grad: 0.0648 (0.0753) loss: 0.8321 (0.8318) time: 0.2725 data: 0.0002 max mem: 26157 Train: [19] [1000/6250] eta: 0:24:27 lr: 0.000118 grad: 0.0651 (0.0748) loss: 0.8288 (0.8318) time: 0.2705 data: 0.0002 max mem: 26157 Train: [19] [1100/6250] eta: 0:23:56 lr: 0.000118 grad: 0.0703 (0.0746) loss: 0.8286 (0.8317) time: 0.2698 data: 0.0002 max mem: 26157 Train: [19] [1200/6250] eta: 0:23:25 lr: 0.000118 grad: 0.0746 (0.0745) loss: 0.8264 (0.8314) time: 0.2686 data: 0.0002 max mem: 26157 Train: [19] [1300/6250] eta: 0:22:53 lr: 0.000118 grad: 0.0735 (0.0744) loss: 0.8243 (0.8311) time: 0.2688 data: 0.0002 max mem: 26157 Train: [19] [1400/6250] eta: 0:22:23 lr: 0.000118 grad: 0.0722 (0.0744) loss: 0.8246 (0.8308) time: 0.2729 data: 0.0002 max mem: 26157 Train: [19] [1500/6250] eta: 0:21:52 lr: 0.000118 grad: 0.0761 (0.0745) loss: 0.8229 (0.8305) time: 0.2687 data: 0.0002 max mem: 26157 Train: [19] [1600/6250] eta: 0:21:22 lr: 0.000118 grad: 0.0712 (0.0745) loss: 0.8311 (0.8303) time: 0.2689 data: 0.0002 max mem: 26157 Train: [19] [1700/6250] eta: 0:20:53 lr: 0.000118 grad: 0.0687 (0.0745) loss: 0.8267 (0.8303) time: 0.2695 data: 0.0002 max mem: 26157 Train: [19] [1800/6250] eta: 0:20:24 lr: 0.000118 grad: 0.0741 (0.0745) loss: 0.8281 (0.8301) time: 0.2688 data: 0.0002 max mem: 26157 Train: [19] [1900/6250] eta: 0:19:55 lr: 0.000118 grad: 0.0782 (0.0747) loss: 0.8217 (0.8298) time: 0.2692 data: 0.0002 max mem: 26157 Train: [19] [2000/6250] eta: 0:19:27 lr: 0.000118 grad: 0.0737 (0.0749) loss: 0.8305 (0.8297) time: 0.2687 data: 0.0002 max mem: 26157 Train: [19] [2100/6250] eta: 0:18:58 lr: 0.000118 grad: 0.0720 (0.0749) loss: 0.8267 (0.8296) time: 0.2699 data: 0.0002 max mem: 26157 Train: [19] [2200/6250] eta: 0:18:30 lr: 0.000118 grad: 0.0757 (0.0750) loss: 0.8232 (0.8295) time: 0.2703 data: 0.0002 max mem: 26157 Train: [19] [2300/6250] eta: 0:18:02 lr: 0.000118 grad: 0.0735 (0.0750) loss: 0.8248 (0.8293) time: 0.2697 data: 0.0002 max mem: 26157 Train: [19] [2400/6250] eta: 0:17:34 lr: 0.000118 grad: 0.0748 (0.0751) loss: 0.8240 (0.8291) time: 0.2680 data: 0.0002 max mem: 26157 Train: [19] [2500/6250] eta: 0:17:06 lr: 0.000118 grad: 0.0731 (0.0753) loss: 0.8177 (0.8288) time: 0.2683 data: 0.0001 max mem: 26157 Train: [19] [2600/6250] eta: 0:16:38 lr: 0.000118 grad: 0.0752 (0.0754) loss: 0.8189 (0.8286) time: 0.2713 data: 0.0002 max mem: 26157 Train: [19] [2700/6250] eta: 0:16:10 lr: 0.000118 grad: 0.0718 (0.0754) loss: 0.8255 (0.8284) time: 0.2683 data: 0.0002 max mem: 26157 Train: [19] [2800/6250] eta: 0:15:42 lr: 0.000118 grad: 0.0746 (0.0755) loss: 0.8204 (0.8283) time: 0.2685 data: 0.0002 max mem: 26157 Train: [19] [2900/6250] eta: 0:15:15 lr: 0.000118 grad: 0.0773 (0.0755) loss: 0.8233 (0.8281) time: 0.2702 data: 0.0003 max mem: 26157 Train: [19] [3000/6250] eta: 0:14:47 lr: 0.000118 grad: 0.0739 (0.0755) loss: 0.8280 (0.8279) time: 0.2690 data: 0.0002 max mem: 26157 Train: [19] [3100/6250] eta: 0:14:19 lr: 0.000118 grad: 0.0736 (0.0756) loss: 0.8311 (0.8279) time: 0.2687 data: 0.0002 max mem: 26157 Train: [19] [3200/6250] eta: 0:13:52 lr: 0.000118 grad: 0.0756 (0.0756) loss: 0.8252 (0.8277) time: 0.2701 data: 0.0002 max mem: 26157 Train: [19] [3300/6250] eta: 0:13:26 lr: 0.000118 grad: 0.0754 (0.0756) loss: 0.8218 (0.8277) time: 0.2684 data: 0.0002 max mem: 26157 Train: [19] [3400/6250] eta: 0:12:58 lr: 0.000118 grad: 0.0701 (0.0755) loss: 0.8263 (0.8276) time: 0.2687 data: 0.0002 max mem: 26157 Train: [19] [3500/6250] eta: 0:12:31 lr: 0.000118 grad: 0.0736 (0.0755) loss: 0.8268 (0.8275) time: 0.2706 data: 0.0002 max mem: 26157 Train: [19] [3600/6250] eta: 0:12:03 lr: 0.000118 grad: 0.0693 (0.0755) loss: 0.8287 (0.8275) time: 0.2688 data: 0.0002 max mem: 26157 Train: [19] [3700/6250] eta: 0:11:35 lr: 0.000118 grad: 0.0736 (0.0754) loss: 0.8180 (0.8274) time: 0.2708 data: 0.0002 max mem: 26157 Train: [19] [3800/6250] eta: 0:11:08 lr: 0.000118 grad: 0.0745 (0.0755) loss: 0.8292 (0.8274) time: 0.2692 data: 0.0002 max mem: 26157 Train: [19] [3900/6250] eta: 0:10:41 lr: 0.000118 grad: 0.0716 (0.0754) loss: 0.8244 (0.8273) time: 0.2706 data: 0.0002 max mem: 26157 Train: [19] [4000/6250] eta: 0:10:13 lr: 0.000118 grad: 0.0757 (0.0756) loss: 0.8267 (0.8272) time: 0.2695 data: 0.0002 max mem: 26157 Train: [19] [4100/6250] eta: 0:09:46 lr: 0.000118 grad: 0.0797 (0.0757) loss: 0.8253 (0.8271) time: 0.2693 data: 0.0002 max mem: 26157 Train: [19] [4200/6250] eta: 0:09:18 lr: 0.000118 grad: 0.0768 (0.0759) loss: 0.8252 (0.8271) time: 0.2686 data: 0.0002 max mem: 26157 Train: [19] [4300/6250] eta: 0:08:51 lr: 0.000118 grad: 0.0709 (0.0759) loss: 0.8280 (0.8270) time: 0.2692 data: 0.0002 max mem: 26157 Train: [19] [4400/6250] eta: 0:08:23 lr: 0.000118 grad: 0.0756 (0.0760) loss: 0.8199 (0.8270) time: 0.2687 data: 0.0002 max mem: 26157 Train: [19] [4500/6250] eta: 0:07:56 lr: 0.000118 grad: 0.0708 (0.0760) loss: 0.8219 (0.8269) time: 0.2700 data: 0.0002 max mem: 26157 Train: [19] [4600/6250] eta: 0:07:29 lr: 0.000118 grad: 0.0771 (0.0761) loss: 0.8249 (0.8269) time: 0.2731 data: 0.0002 max mem: 26157 Train: [19] [4700/6250] eta: 0:07:02 lr: 0.000118 grad: 0.0748 (0.0761) loss: 0.8235 (0.8268) time: 0.2727 data: 0.0002 max mem: 26157 Train: [19] [4800/6250] eta: 0:06:34 lr: 0.000118 grad: 0.0789 (0.0761) loss: 0.8197 (0.8267) time: 0.2760 data: 0.0002 max mem: 26157 Train: [19] [4900/6250] eta: 0:06:09 lr: 0.000118 grad: 0.0739 (0.0762) loss: 0.8180 (0.8267) time: 0.2689 data: 0.0002 max mem: 26157 Train: [19] [5000/6250] eta: 0:05:41 lr: 0.000118 grad: 0.0747 (0.0763) loss: 0.8209 (0.8266) time: 0.2718 data: 0.0002 max mem: 26157 Train: [19] [5100/6250] eta: 0:05:14 lr: 0.000118 grad: 0.0718 (0.0763) loss: 0.8223 (0.8266) time: 0.2742 data: 0.0002 max mem: 26157 Train: [19] [5200/6250] eta: 0:04:47 lr: 0.000118 grad: 0.0763 (0.0762) loss: 0.8233 (0.8265) time: 0.2707 data: 0.0002 max mem: 26157 Train: [19] [5300/6250] eta: 0:04:19 lr: 0.000118 grad: 0.0739 (0.0762) loss: 0.8227 (0.8265) time: 0.2701 data: 0.0002 max mem: 26157 Train: [19] [5400/6250] eta: 0:03:52 lr: 0.000118 grad: 0.0716 (0.0762) loss: 0.8223 (0.8265) time: 0.2694 data: 0.0002 max mem: 26157 Train: [19] [5500/6250] eta: 0:03:25 lr: 0.000118 grad: 0.0752 (0.0762) loss: 0.8256 (0.8264) time: 0.2690 data: 0.0002 max mem: 26157 Train: [19] [5600/6250] eta: 0:02:57 lr: 0.000118 grad: 0.0783 (0.0762) loss: 0.8230 (0.8264) time: 0.2686 data: 0.0002 max mem: 26157 Train: [19] [5700/6250] eta: 0:02:30 lr: 0.000118 grad: 0.0754 (0.0762) loss: 0.8231 (0.8264) time: 0.2682 data: 0.0002 max mem: 26157 Train: [19] [5800/6250] eta: 0:02:02 lr: 0.000118 grad: 0.0736 (0.0762) loss: 0.8261 (0.8264) time: 0.2670 data: 0.0002 max mem: 26157 Train: [19] [5900/6250] eta: 0:01:35 lr: 0.000118 grad: 0.0727 (0.0762) loss: 0.8267 (0.8264) time: 0.2713 data: 0.0002 max mem: 26157 Train: [19] [6000/6250] eta: 0:01:08 lr: 0.000118 grad: 0.0760 (0.0762) loss: 0.8227 (0.8264) time: 0.2716 data: 0.0002 max mem: 26157 Train: [19] [6100/6250] eta: 0:00:41 lr: 0.000117 grad: 0.0723 (0.0761) loss: 0.8297 (0.8264) time: 0.2728 data: 0.0002 max mem: 26157 Train: [19] [6200/6250] eta: 0:00:13 lr: 0.000117 grad: 0.0786 (0.0762) loss: 0.8294 (0.8263) time: 0.2710 data: 0.0002 max mem: 26157 Train: [19] [6249/6250] eta: 0:00:00 lr: 0.000117 grad: 0.0793 (0.0762) loss: 0.8290 (0.8263) time: 0.2702 data: 0.0002 max mem: 26157 Train: [19] Total time: 0:28:38 (0.2749 s / it) Averaged stats: lr: 0.000117 grad: 0.0793 (0.0762) loss: 0.8290 (0.8263) Eval (hcp-train-subset): [19] [ 0/62] eta: 0:04:24 loss: 0.8451 (0.8451) time: 4.2592 data: 4.1388 max mem: 26157 Eval (hcp-train-subset): [19] [61/62] eta: 0:00:00 loss: 0.8352 (0.8380) time: 0.1435 data: 0.0607 max mem: 26157 Eval (hcp-train-subset): [19] Total time: 0:00:13 (0.2134 s / it) Averaged stats (hcp-train-subset): loss: 0.8352 (0.8380) Making plots (hcp-train-subset): example=18 Eval (hcp-val): [19] [ 0/62] eta: 0:05:35 loss: 0.8316 (0.8316) time: 5.4181 data: 5.3340 max mem: 26157 Eval (hcp-val): [19] [61/62] eta: 0:00:00 loss: 0.8323 (0.8337) time: 0.1216 data: 0.0384 max mem: 26157 Eval (hcp-val): [19] Total time: 0:00:12 (0.2043 s / it) Averaged stats (hcp-val): loss: 0.8323 (0.8337) Making plots (hcp-val): example=18 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [20] [ 0/6250] eta: 6:28:51 lr: 0.000117 grad: 0.0507 (0.0507) loss: 0.8646 (0.8646) time: 3.7330 data: 3.3768 max mem: 26157 Train: [20] [ 100/6250] eta: 0:33:47 lr: 0.000117 grad: 0.0806 (0.0893) loss: 0.8312 (0.8342) time: 0.2690 data: 0.0002 max mem: 26157 Train: [20] [ 200/6250] eta: 0:30:15 lr: 0.000117 grad: 0.0792 (0.0852) loss: 0.8215 (0.8316) time: 0.2717 data: 0.0002 max mem: 26157 Train: [20] [ 300/6250] eta: 0:28:44 lr: 0.000117 grad: 0.0688 (0.0837) loss: 0.8266 (0.8276) time: 0.2686 data: 0.0002 max mem: 26157 Train: [20] [ 400/6250] eta: 0:27:44 lr: 0.000117 grad: 0.0753 (0.0821) loss: 0.8251 (0.8266) time: 0.2698 data: 0.0002 max mem: 26157 Train: [20] [ 500/6250] eta: 0:26:59 lr: 0.000117 grad: 0.0783 (0.0813) loss: 0.8232 (0.8261) time: 0.2699 data: 0.0002 max mem: 26157 Train: [20] [ 600/6250] eta: 0:26:20 lr: 0.000117 grad: 0.0723 (0.0807) loss: 0.8293 (0.8260) time: 0.2707 data: 0.0002 max mem: 26157 Train: [20] [ 700/6250] eta: 0:25:44 lr: 0.000117 grad: 0.0699 (0.0796) loss: 0.8267 (0.8263) time: 0.2691 data: 0.0002 max mem: 26157 Train: [20] [ 800/6250] eta: 0:25:10 lr: 0.000117 grad: 0.0698 (0.0791) loss: 0.8284 (0.8262) time: 0.2712 data: 0.0002 max mem: 26157 Train: [20] [ 900/6250] eta: 0:24:38 lr: 0.000117 grad: 0.0747 (0.0785) loss: 0.8234 (0.8260) time: 0.2689 data: 0.0002 max mem: 26157 Train: [20] [1000/6250] eta: 0:24:07 lr: 0.000117 grad: 0.0679 (0.0778) loss: 0.8256 (0.8258) time: 0.2701 data: 0.0002 max mem: 26157 Train: [20] [1100/6250] eta: 0:23:36 lr: 0.000117 grad: 0.0660 (0.0780) loss: 0.8328 (0.8259) time: 0.2693 data: 0.0002 max mem: 26157 Train: [20] [1200/6250] eta: 0:23:08 lr: 0.000117 grad: 0.0746 (0.0778) loss: 0.8245 (0.8257) time: 0.2719 data: 0.0002 max mem: 26157 Train: [20] [1300/6250] eta: 0:22:39 lr: 0.000117 grad: 0.0719 (0.0775) loss: 0.8262 (0.8257) time: 0.2698 data: 0.0002 max mem: 26157 Train: [20] [1400/6250] eta: 0:22:09 lr: 0.000117 grad: 0.0729 (0.0772) loss: 0.8299 (0.8257) time: 0.2690 data: 0.0002 max mem: 26157 Train: [20] [1500/6250] eta: 0:21:41 lr: 0.000117 grad: 0.0711 (0.0770) loss: 0.8272 (0.8255) time: 0.2709 data: 0.0002 max mem: 26157 Train: [20] [1600/6250] eta: 0:21:13 lr: 0.000117 grad: 0.0760 (0.0769) loss: 0.8302 (0.8256) time: 0.2690 data: 0.0002 max mem: 26157 Train: [20] [1700/6250] eta: 0:20:44 lr: 0.000117 grad: 0.0732 (0.0768) loss: 0.8248 (0.8256) time: 0.2718 data: 0.0002 max mem: 26157 Train: [20] [1800/6250] eta: 0:20:16 lr: 0.000117 grad: 0.0731 (0.0767) loss: 0.8251 (0.8255) time: 0.2689 data: 0.0002 max mem: 26157 Train: [20] [1900/6250] eta: 0:19:47 lr: 0.000117 grad: 0.0750 (0.0766) loss: 0.8234 (0.8255) time: 0.2700 data: 0.0002 max mem: 26157 Train: [20] [2000/6250] eta: 0:19:19 lr: 0.000117 grad: 0.0748 (0.0765) loss: 0.8186 (0.8253) time: 0.2692 data: 0.0002 max mem: 26157 Train: [20] [2100/6250] eta: 0:18:56 lr: 0.000117 grad: 0.0724 (0.0764) loss: 0.8186 (0.8252) time: 0.2687 data: 0.0002 max mem: 26157 Train: [20] [2200/6250] eta: 0:18:28 lr: 0.000117 grad: 0.0731 (0.0764) loss: 0.8236 (0.8251) time: 0.2686 data: 0.0002 max mem: 26157 Train: [20] [2300/6250] eta: 0:18:05 lr: 0.000117 grad: 0.0713 (0.0763) loss: 0.8227 (0.8251) time: 0.2790 data: 0.0002 max mem: 26157 Train: [20] [2400/6250] eta: 0:17:36 lr: 0.000117 grad: 0.0784 (0.0763) loss: 0.8255 (0.8252) time: 0.2693 data: 0.0002 max mem: 26157 Train: [20] [2500/6250] eta: 0:17:08 lr: 0.000117 grad: 0.0762 (0.0763) loss: 0.8196 (0.8251) time: 0.2695 data: 0.0002 max mem: 26157 Train: [20] [2600/6250] eta: 0:16:40 lr: 0.000117 grad: 0.0781 (0.0764) loss: 0.8223 (0.8250) time: 0.2706 data: 0.0002 max mem: 26157 Train: [20] [2700/6250] eta: 0:16:12 lr: 0.000117 grad: 0.0763 (0.0763) loss: 0.8220 (0.8250) time: 0.2671 data: 0.0001 max mem: 26157 Train: [20] [2800/6250] eta: 0:15:44 lr: 0.000117 grad: 0.0779 (0.0763) loss: 0.8209 (0.8250) time: 0.2708 data: 0.0002 max mem: 26157 Train: [20] [2900/6250] eta: 0:15:16 lr: 0.000117 grad: 0.0728 (0.0763) loss: 0.8266 (0.8250) time: 0.2686 data: 0.0001 max mem: 26157 Train: [20] [3000/6250] eta: 0:14:48 lr: 0.000117 grad: 0.0716 (0.0764) loss: 0.8277 (0.8250) time: 0.2706 data: 0.0002 max mem: 26157 Train: [20] [3100/6250] eta: 0:14:20 lr: 0.000117 grad: 0.0741 (0.0764) loss: 0.8233 (0.8251) time: 0.2687 data: 0.0002 max mem: 26157 Train: [20] [3200/6250] eta: 0:13:53 lr: 0.000117 grad: 0.0737 (0.0764) loss: 0.8262 (0.8251) time: 0.2701 data: 0.0002 max mem: 26157 Train: [20] [3300/6250] eta: 0:13:25 lr: 0.000117 grad: 0.0743 (0.0764) loss: 0.8270 (0.8251) time: 0.2700 data: 0.0002 max mem: 26157 Train: [20] [3400/6250] eta: 0:12:58 lr: 0.000117 grad: 0.0816 (0.0765) loss: 0.8235 (0.8251) time: 0.2695 data: 0.0002 max mem: 26157 Train: [20] [3500/6250] eta: 0:12:30 lr: 0.000117 grad: 0.0737 (0.0765) loss: 0.8246 (0.8252) time: 0.2699 data: 0.0002 max mem: 26157 Train: [20] [3600/6250] eta: 0:12:03 lr: 0.000117 grad: 0.0733 (0.0765) loss: 0.8212 (0.8252) time: 0.2715 data: 0.0002 max mem: 26157 Train: [20] [3700/6250] eta: 0:11:35 lr: 0.000117 grad: 0.0729 (0.0765) loss: 0.8259 (0.8251) time: 0.2719 data: 0.0002 max mem: 26157 Train: [20] [3800/6250] eta: 0:11:08 lr: 0.000117 grad: 0.0730 (0.0766) loss: 0.8305 (0.8251) time: 0.2708 data: 0.0002 max mem: 26157 Train: [20] [3900/6250] eta: 0:10:40 lr: 0.000117 grad: 0.0829 (0.0767) loss: 0.8250 (0.8251) time: 0.2698 data: 0.0002 max mem: 26157 Train: [20] [4000/6250] eta: 0:10:14 lr: 0.000117 grad: 0.0746 (0.0767) loss: 0.8307 (0.8251) time: 0.3309 data: 0.0574 max mem: 26157 Train: [20] [4100/6250] eta: 0:09:49 lr: 0.000117 grad: 0.0755 (0.0768) loss: 0.8228 (0.8251) time: 0.5051 data: 0.2219 max mem: 26157 Train: [20] [4200/6250] eta: 0:09:21 lr: 0.000117 grad: 0.0746 (0.0768) loss: 0.8269 (0.8251) time: 0.2684 data: 0.0002 max mem: 26157 Train: [20] [4300/6250] eta: 0:08:53 lr: 0.000117 grad: 0.0837 (0.0769) loss: 0.8238 (0.8251) time: 0.2686 data: 0.0002 max mem: 26157 Train: [20] [4400/6250] eta: 0:08:26 lr: 0.000117 grad: 0.0751 (0.0769) loss: 0.8262 (0.8250) time: 0.2685 data: 0.0002 max mem: 26157 Train: [20] [4500/6250] eta: 0:08:02 lr: 0.000117 grad: 0.0805 (0.0770) loss: 0.8226 (0.8251) time: 0.2682 data: 0.0002 max mem: 26157 Train: [20] [4600/6250] eta: 0:07:34 lr: 0.000117 grad: 0.0747 (0.0771) loss: 0.8293 (0.8251) time: 0.2717 data: 0.0003 max mem: 26157 Train: [20] [4700/6250] eta: 0:07:07 lr: 0.000117 grad: 0.0796 (0.0772) loss: 0.8257 (0.8251) time: 0.2692 data: 0.0002 max mem: 26157 Train: [20] [4800/6250] eta: 0:06:39 lr: 0.000117 grad: 0.0730 (0.0772) loss: 0.8274 (0.8251) time: 0.2684 data: 0.0002 max mem: 26157 Train: [20] [4900/6250] eta: 0:06:11 lr: 0.000117 grad: 0.0736 (0.0773) loss: 0.8275 (0.8250) time: 0.2682 data: 0.0002 max mem: 26157 Train: [20] [5000/6250] eta: 0:05:43 lr: 0.000117 grad: 0.0769 (0.0773) loss: 0.8263 (0.8251) time: 0.2740 data: 0.0002 max mem: 26157 Train: [20] [5100/6250] eta: 0:05:16 lr: 0.000117 grad: 0.0856 (0.0774) loss: 0.8190 (0.8250) time: 0.2680 data: 0.0002 max mem: 26157 Train: [20] [5200/6250] eta: 0:04:48 lr: 0.000117 grad: 0.0748 (0.0774) loss: 0.8192 (0.8251) time: 0.2690 data: 0.0002 max mem: 26157 Train: [20] [5300/6250] eta: 0:04:21 lr: 0.000117 grad: 0.0719 (0.0774) loss: 0.8228 (0.8251) time: 0.2693 data: 0.0002 max mem: 26157 Train: [20] [5400/6250] eta: 0:03:53 lr: 0.000117 grad: 0.0731 (0.0774) loss: 0.8281 (0.8251) time: 0.2714 data: 0.0002 max mem: 26157 Train: [20] [5500/6250] eta: 0:03:25 lr: 0.000117 grad: 0.0727 (0.0774) loss: 0.8276 (0.8252) time: 0.2701 data: 0.0002 max mem: 26157 Train: [20] [5600/6250] eta: 0:02:58 lr: 0.000117 grad: 0.0783 (0.0775) loss: 0.8218 (0.8251) time: 0.2672 data: 0.0001 max mem: 26157 Train: [20] [5700/6250] eta: 0:02:30 lr: 0.000117 grad: 0.0743 (0.0775) loss: 0.8279 (0.8251) time: 0.2687 data: 0.0002 max mem: 26157 Train: [20] [5800/6250] eta: 0:02:03 lr: 0.000117 grad: 0.0707 (0.0775) loss: 0.8277 (0.8251) time: 0.2699 data: 0.0002 max mem: 26157 Train: [20] [5900/6250] eta: 0:01:36 lr: 0.000117 grad: 0.0777 (0.0775) loss: 0.8223 (0.8251) time: 0.2709 data: 0.0002 max mem: 26157 Train: [20] [6000/6250] eta: 0:01:08 lr: 0.000116 grad: 0.0708 (0.0775) loss: 0.8291 (0.8251) time: 0.2708 data: 0.0002 max mem: 26157 Train: [20] [6100/6250] eta: 0:00:41 lr: 0.000116 grad: 0.0766 (0.0775) loss: 0.8219 (0.8251) time: 0.2731 data: 0.0002 max mem: 26157 Train: [20] [6200/6250] eta: 0:00:13 lr: 0.000116 grad: 0.0772 (0.0776) loss: 0.8251 (0.8251) time: 0.2735 data: 0.0002 max mem: 26157 Train: [20] [6249/6250] eta: 0:00:00 lr: 0.000116 grad: 0.0736 (0.0776) loss: 0.8258 (0.8251) time: 0.2714 data: 0.0002 max mem: 26157 Train: [20] Total time: 0:28:41 (0.2754 s / it) Averaged stats: lr: 0.000116 grad: 0.0736 (0.0776) loss: 0.8258 (0.8251) Eval (hcp-train-subset): [20] [ 0/62] eta: 0:05:04 loss: 0.8491 (0.8491) time: 4.9137 data: 4.8179 max mem: 26157 Eval (hcp-train-subset): [20] [61/62] eta: 0:00:00 loss: 0.8351 (0.8378) time: 0.1372 data: 0.0541 max mem: 26157 Eval (hcp-train-subset): [20] Total time: 0:00:13 (0.2117 s / it) Averaged stats (hcp-train-subset): loss: 0.8351 (0.8378) Making plots (hcp-train-subset): example=30 Eval (hcp-val): [20] [ 0/62] eta: 0:04:19 loss: 0.8302 (0.8302) time: 4.1809 data: 4.0887 max mem: 26157 Eval (hcp-val): [20] [61/62] eta: 0:00:00 loss: 0.8317 (0.8335) time: 0.1327 data: 0.0499 max mem: 26157 Eval (hcp-val): [20] Total time: 0:00:12 (0.2074 s / it) Averaged stats (hcp-val): loss: 0.8317 (0.8335) Making plots (hcp-val): example=25 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [21] [ 0/6250] eta: 11:09:42 lr: 0.000116 grad: 0.1017 (0.1017) loss: 0.8293 (0.8293) time: 6.4292 data: 6.1545 max mem: 26157 Train: [21] [ 100/6250] eta: 0:33:57 lr: 0.000116 grad: 0.0803 (0.0876) loss: 0.8289 (0.8340) time: 0.2691 data: 0.0002 max mem: 26157 Train: [21] [ 200/6250] eta: 0:30:19 lr: 0.000116 grad: 0.0782 (0.0836) loss: 0.8242 (0.8280) time: 0.2686 data: 0.0002 max mem: 26157 Train: [21] [ 300/6250] eta: 0:28:47 lr: 0.000116 grad: 0.0720 (0.0814) loss: 0.8237 (0.8263) time: 0.2679 data: 0.0002 max mem: 26157 Train: [21] [ 400/6250] eta: 0:27:47 lr: 0.000116 grad: 0.0707 (0.0797) loss: 0.8240 (0.8261) time: 0.2693 data: 0.0002 max mem: 26157 Train: [21] [ 500/6250] eta: 0:27:02 lr: 0.000116 grad: 0.0700 (0.0790) loss: 0.8266 (0.8261) time: 0.2701 data: 0.0002 max mem: 26157 Train: [21] [ 600/6250] eta: 0:26:21 lr: 0.000116 grad: 0.0726 (0.0779) loss: 0.8249 (0.8260) time: 0.2691 data: 0.0002 max mem: 26157 Train: [21] [ 700/6250] eta: 0:25:45 lr: 0.000116 grad: 0.0767 (0.0776) loss: 0.8219 (0.8255) time: 0.2700 data: 0.0002 max mem: 26157 Train: [21] [ 800/6250] eta: 0:25:11 lr: 0.000116 grad: 0.0771 (0.0773) loss: 0.8302 (0.8256) time: 0.2701 data: 0.0002 max mem: 26157 Train: [21] [ 900/6250] eta: 0:24:38 lr: 0.000116 grad: 0.0692 (0.0771) loss: 0.8260 (0.8255) time: 0.2699 data: 0.0002 max mem: 26157 Train: [21] [1000/6250] eta: 0:24:07 lr: 0.000116 grad: 0.0737 (0.0769) loss: 0.8244 (0.8255) time: 0.2682 data: 0.0003 max mem: 26157 Train: [21] [1100/6250] eta: 0:23:36 lr: 0.000116 grad: 0.0752 (0.0769) loss: 0.8159 (0.8253) time: 0.2685 data: 0.0002 max mem: 26157 Train: [21] [1200/6250] eta: 0:23:06 lr: 0.000116 grad: 0.0775 (0.0769) loss: 0.8203 (0.8252) time: 0.2698 data: 0.0002 max mem: 26157 Train: [21] [1300/6250] eta: 0:22:37 lr: 0.000116 grad: 0.0712 (0.0769) loss: 0.8166 (0.8247) time: 0.2691 data: 0.0002 max mem: 26157 Train: [21] [1400/6250] eta: 0:22:08 lr: 0.000116 grad: 0.0732 (0.0769) loss: 0.8142 (0.8246) time: 0.2687 data: 0.0002 max mem: 26157 Train: [21] [1500/6250] eta: 0:21:39 lr: 0.000116 grad: 0.0733 (0.0771) loss: 0.8145 (0.8243) time: 0.2696 data: 0.0002 max mem: 26157 Train: [21] [1600/6250] eta: 0:21:16 lr: 0.000116 grad: 0.0712 (0.0772) loss: 0.8199 (0.8241) time: 0.2722 data: 0.0002 max mem: 26157 Train: [21] [1700/6250] eta: 0:20:54 lr: 0.000116 grad: 0.0761 (0.0771) loss: 0.8251 (0.8240) time: 0.2714 data: 0.0020 max mem: 26157 Train: [21] [1800/6250] eta: 0:20:26 lr: 0.000116 grad: 0.0752 (0.0771) loss: 0.8186 (0.8239) time: 0.2767 data: 0.0002 max mem: 26157 Train: [21] [1900/6250] eta: 0:20:05 lr: 0.000116 grad: 0.0711 (0.0770) loss: 0.8231 (0.8238) time: 0.2707 data: 0.0002 max mem: 26157 Train: [21] [2000/6250] eta: 0:19:36 lr: 0.000116 grad: 0.0710 (0.0768) loss: 0.8219 (0.8238) time: 0.2703 data: 0.0002 max mem: 26157 Train: [21] [2100/6250] eta: 0:19:07 lr: 0.000116 grad: 0.0771 (0.0768) loss: 0.8267 (0.8238) time: 0.2708 data: 0.0002 max mem: 26157 Train: [21] [2200/6250] eta: 0:18:38 lr: 0.000116 grad: 0.0768 (0.0767) loss: 0.8180 (0.8238) time: 0.2709 data: 0.0002 max mem: 26157 Train: [21] [2300/6250] eta: 0:18:09 lr: 0.000116 grad: 0.0720 (0.0767) loss: 0.8214 (0.8238) time: 0.2697 data: 0.0002 max mem: 26157 Train: [21] [2400/6250] eta: 0:17:41 lr: 0.000116 grad: 0.0738 (0.0768) loss: 0.8295 (0.8237) time: 0.2698 data: 0.0002 max mem: 26157 Train: [21] [2500/6250] eta: 0:17:12 lr: 0.000116 grad: 0.0816 (0.0769) loss: 0.8268 (0.8237) time: 0.2690 data: 0.0002 max mem: 26157 Train: [21] [2600/6250] eta: 0:16:44 lr: 0.000116 grad: 0.0778 (0.0769) loss: 0.8148 (0.8236) time: 0.2704 data: 0.0002 max mem: 26157 Train: [21] [2700/6250] eta: 0:16:16 lr: 0.000116 grad: 0.0724 (0.0769) loss: 0.8223 (0.8236) time: 0.2732 data: 0.0002 max mem: 26157 Train: [21] [2800/6250] eta: 0:15:49 lr: 0.000116 grad: 0.0799 (0.0770) loss: 0.8273 (0.8235) time: 0.2685 data: 0.0002 max mem: 26157 Train: [21] [2900/6250] eta: 0:15:21 lr: 0.000116 grad: 0.0740 (0.0770) loss: 0.8245 (0.8235) time: 0.2710 data: 0.0002 max mem: 26157 Train: [21] [3000/6250] eta: 0:14:53 lr: 0.000116 grad: 0.0808 (0.0771) loss: 0.8187 (0.8234) time: 0.2710 data: 0.0002 max mem: 26157 Train: [21] [3100/6250] eta: 0:14:26 lr: 0.000116 grad: 0.0761 (0.0771) loss: 0.8204 (0.8234) time: 0.2702 data: 0.0002 max mem: 26157 Train: [21] [3200/6250] eta: 0:13:58 lr: 0.000116 grad: 0.0769 (0.0772) loss: 0.8225 (0.8234) time: 0.2698 data: 0.0002 max mem: 26157 Train: [21] [3300/6250] eta: 0:13:30 lr: 0.000116 grad: 0.0802 (0.0773) loss: 0.8231 (0.8233) time: 0.2691 data: 0.0002 max mem: 26157 Train: [21] [3400/6250] eta: 0:13:02 lr: 0.000116 grad: 0.0760 (0.0774) loss: 0.8205 (0.8233) time: 0.2686 data: 0.0002 max mem: 26157 Train: [21] [3500/6250] eta: 0:12:34 lr: 0.000116 grad: 0.0718 (0.0774) loss: 0.8290 (0.8233) time: 0.2701 data: 0.0002 max mem: 26157 Train: [21] [3600/6250] eta: 0:12:06 lr: 0.000116 grad: 0.0681 (0.0776) loss: 0.8190 (0.8233) time: 0.2705 data: 0.0002 max mem: 26157 Train: [21] [3700/6250] eta: 0:11:39 lr: 0.000116 grad: 0.0738 (0.0776) loss: 0.8287 (0.8233) time: 0.2696 data: 0.0002 max mem: 26157 Train: [21] [3800/6250] eta: 0:11:11 lr: 0.000116 grad: 0.0805 (0.0776) loss: 0.8172 (0.8233) time: 0.2716 data: 0.0002 max mem: 26157 Train: [21] [3900/6250] eta: 0:10:43 lr: 0.000116 grad: 0.0776 (0.0777) loss: 0.8263 (0.8233) time: 0.2688 data: 0.0002 max mem: 26157 Train: [21] [4000/6250] eta: 0:10:16 lr: 0.000116 grad: 0.0781 (0.0777) loss: 0.8237 (0.8234) time: 0.2683 data: 0.0001 max mem: 26157 Train: [21] [4100/6250] eta: 0:09:48 lr: 0.000116 grad: 0.0758 (0.0777) loss: 0.8225 (0.8234) time: 0.2688 data: 0.0001 max mem: 26157 Train: [21] [4200/6250] eta: 0:09:20 lr: 0.000116 grad: 0.0757 (0.0777) loss: 0.8297 (0.8234) time: 0.2685 data: 0.0002 max mem: 26157 Train: [21] [4300/6250] eta: 0:08:53 lr: 0.000116 grad: 0.0755 (0.0777) loss: 0.8296 (0.8235) time: 0.2703 data: 0.0002 max mem: 26157 Train: [21] [4400/6250] eta: 0:08:25 lr: 0.000116 grad: 0.0698 (0.0777) loss: 0.8241 (0.8235) time: 0.2679 data: 0.0002 max mem: 26157 Train: [21] [4500/6250] eta: 0:07:58 lr: 0.000116 grad: 0.0733 (0.0777) loss: 0.8251 (0.8236) time: 0.2691 data: 0.0002 max mem: 26157 Train: [21] [4600/6250] eta: 0:07:30 lr: 0.000116 grad: 0.0727 (0.0776) loss: 0.8298 (0.8236) time: 0.2704 data: 0.0002 max mem: 26157 Train: [21] [4700/6250] eta: 0:07:03 lr: 0.000116 grad: 0.0803 (0.0777) loss: 0.8265 (0.8236) time: 0.2699 data: 0.0002 max mem: 26157 Train: [21] [4800/6250] eta: 0:06:35 lr: 0.000116 grad: 0.0784 (0.0777) loss: 0.8182 (0.8236) time: 0.2689 data: 0.0001 max mem: 26157 Train: [21] [4900/6250] eta: 0:06:08 lr: 0.000116 grad: 0.0732 (0.0778) loss: 0.8234 (0.8236) time: 0.2699 data: 0.0002 max mem: 26157 Train: [21] [5000/6250] eta: 0:05:41 lr: 0.000116 grad: 0.0762 (0.0778) loss: 0.8233 (0.8236) time: 0.2695 data: 0.0002 max mem: 26157 Train: [21] [5100/6250] eta: 0:05:13 lr: 0.000116 grad: 0.0711 (0.0778) loss: 0.8247 (0.8236) time: 0.2704 data: 0.0002 max mem: 26157 Train: [21] [5200/6250] eta: 0:04:46 lr: 0.000116 grad: 0.0696 (0.0778) loss: 0.8211 (0.8236) time: 0.2691 data: 0.0002 max mem: 26157 Train: [21] [5300/6250] eta: 0:04:19 lr: 0.000116 grad: 0.0742 (0.0777) loss: 0.8237 (0.8236) time: 0.2677 data: 0.0002 max mem: 26157 Train: [21] [5400/6250] eta: 0:03:51 lr: 0.000116 grad: 0.0752 (0.0778) loss: 0.8169 (0.8236) time: 0.2700 data: 0.0002 max mem: 26157 Train: [21] [5500/6250] eta: 0:03:24 lr: 0.000116 grad: 0.0750 (0.0778) loss: 0.8257 (0.8236) time: 0.2712 data: 0.0002 max mem: 26157 Train: [21] [5600/6250] eta: 0:02:57 lr: 0.000115 grad: 0.0700 (0.0778) loss: 0.8293 (0.8236) time: 0.2690 data: 0.0002 max mem: 26157 Train: [21] [5700/6250] eta: 0:02:30 lr: 0.000115 grad: 0.0801 (0.0777) loss: 0.8245 (0.8237) time: 0.2708 data: 0.0002 max mem: 26157 Train: [21] [5800/6250] eta: 0:02:02 lr: 0.000115 grad: 0.0722 (0.0778) loss: 0.8270 (0.8237) time: 0.2709 data: 0.0002 max mem: 26157 Train: [21] [5900/6250] eta: 0:01:35 lr: 0.000115 grad: 0.0762 (0.0778) loss: 0.8233 (0.8237) time: 0.2764 data: 0.0002 max mem: 26157 Train: [21] [6000/6250] eta: 0:01:08 lr: 0.000115 grad: 0.0780 (0.0777) loss: 0.8277 (0.8237) time: 0.2695 data: 0.0002 max mem: 26157 Train: [21] [6100/6250] eta: 0:00:40 lr: 0.000115 grad: 0.0725 (0.0777) loss: 0.8313 (0.8238) time: 0.2707 data: 0.0002 max mem: 26157 Train: [21] [6200/6250] eta: 0:00:13 lr: 0.000115 grad: 0.0769 (0.0777) loss: 0.8253 (0.8239) time: 0.3031 data: 0.0316 max mem: 26157 Train: [21] [6249/6250] eta: 0:00:00 lr: 0.000115 grad: 0.0748 (0.0777) loss: 0.8287 (0.8239) time: 0.2734 data: 0.0002 max mem: 26157 Train: [21] Total time: 0:28:35 (0.2745 s / it) Averaged stats: lr: 0.000115 grad: 0.0748 (0.0777) loss: 0.8287 (0.8239) Eval (hcp-train-subset): [21] [ 0/62] eta: 0:04:16 loss: 0.8487 (0.8487) time: 4.1352 data: 4.0223 max mem: 26157 Eval (hcp-train-subset): [21] [61/62] eta: 0:00:00 loss: 0.8368 (0.8374) time: 0.1228 data: 0.0382 max mem: 26157 Eval (hcp-train-subset): [21] Total time: 0:00:13 (0.2130 s / it) Averaged stats (hcp-train-subset): loss: 0.8368 (0.8374) Making plots (hcp-train-subset): example=56 Eval (hcp-val): [21] [ 0/62] eta: 0:04:38 loss: 0.8295 (0.8295) time: 4.4986 data: 4.4147 max mem: 26157 Eval (hcp-val): [21] [61/62] eta: 0:00:00 loss: 0.8312 (0.8327) time: 0.1360 data: 0.0531 max mem: 26157 Eval (hcp-val): [21] Total time: 0:00:13 (0.2138 s / it) Averaged stats (hcp-val): loss: 0.8312 (0.8327) Making plots (hcp-val): example=26 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [22] [ 0/6250] eta: 8:44:51 lr: 0.000115 grad: 0.1219 (0.1219) loss: 0.8218 (0.8218) time: 5.0387 data: 4.6837 max mem: 26157 Train: [22] [ 100/6250] eta: 0:33:05 lr: 0.000115 grad: 0.0818 (0.0894) loss: 0.8220 (0.8275) time: 0.2709 data: 0.0002 max mem: 26157 Train: [22] [ 200/6250] eta: 0:29:54 lr: 0.000115 grad: 0.0847 (0.0897) loss: 0.8074 (0.8214) time: 0.2687 data: 0.0002 max mem: 26157 Train: [22] [ 300/6250] eta: 0:28:33 lr: 0.000115 grad: 0.0809 (0.0882) loss: 0.8163 (0.8188) time: 0.2703 data: 0.0002 max mem: 26157 Train: [22] [ 400/6250] eta: 0:27:39 lr: 0.000115 grad: 0.0794 (0.0870) loss: 0.8178 (0.8182) time: 0.2718 data: 0.0002 max mem: 26157 Train: [22] [ 500/6250] eta: 0:26:54 lr: 0.000115 grad: 0.0784 (0.0863) loss: 0.8188 (0.8180) time: 0.2692 data: 0.0002 max mem: 26157 Train: [22] [ 600/6250] eta: 0:26:15 lr: 0.000115 grad: 0.0746 (0.0860) loss: 0.8209 (0.8176) time: 0.2683 data: 0.0002 max mem: 26157 Train: [22] [ 700/6250] eta: 0:25:41 lr: 0.000115 grad: 0.0745 (0.0848) loss: 0.8232 (0.8181) time: 0.2706 data: 0.0002 max mem: 26157 Train: [22] [ 800/6250] eta: 0:25:07 lr: 0.000115 grad: 0.0733 (0.0840) loss: 0.8270 (0.8183) time: 0.2678 data: 0.0002 max mem: 26157 Train: [22] [ 900/6250] eta: 0:24:36 lr: 0.000115 grad: 0.0746 (0.0834) loss: 0.8203 (0.8188) time: 0.2727 data: 0.0002 max mem: 26157 Train: [22] [1000/6250] eta: 0:24:17 lr: 0.000115 grad: 0.0785 (0.0827) loss: 0.8232 (0.8191) time: 0.2722 data: 0.0002 max mem: 26157 Train: [22] [1100/6250] eta: 0:23:45 lr: 0.000115 grad: 0.0738 (0.0824) loss: 0.8231 (0.8193) time: 0.2694 data: 0.0002 max mem: 26157 Train: [22] [1200/6250] eta: 0:23:16 lr: 0.000115 grad: 0.0790 (0.0823) loss: 0.8185 (0.8193) time: 0.2701 data: 0.0003 max mem: 26157 Train: [22] [1300/6250] eta: 0:22:46 lr: 0.000115 grad: 0.0752 (0.0821) loss: 0.8163 (0.8193) time: 0.2689 data: 0.0002 max mem: 26157 Train: [22] [1400/6250] eta: 0:22:16 lr: 0.000115 grad: 0.0767 (0.0820) loss: 0.8219 (0.8193) time: 0.2710 data: 0.0015 max mem: 26157 Train: [22] [1500/6250] eta: 0:21:46 lr: 0.000115 grad: 0.0749 (0.0817) loss: 0.8223 (0.8194) time: 0.2697 data: 0.0002 max mem: 26157 Train: [22] [1600/6250] eta: 0:21:18 lr: 0.000115 grad: 0.0870 (0.0817) loss: 0.8208 (0.8195) time: 0.2695 data: 0.0002 max mem: 26157 Train: [22] [1700/6250] eta: 0:20:49 lr: 0.000115 grad: 0.0793 (0.0816) loss: 0.8183 (0.8194) time: 0.2756 data: 0.0002 max mem: 26157 Train: [22] [1800/6250] eta: 0:20:20 lr: 0.000115 grad: 0.0818 (0.0815) loss: 0.8195 (0.8194) time: 0.2700 data: 0.0002 max mem: 26157 Train: [22] [1900/6250] eta: 0:19:52 lr: 0.000115 grad: 0.0788 (0.0814) loss: 0.8200 (0.8195) time: 0.2692 data: 0.0002 max mem: 26157 Train: [22] [2000/6250] eta: 0:19:23 lr: 0.000115 grad: 0.0780 (0.0814) loss: 0.8177 (0.8193) time: 0.2685 data: 0.0002 max mem: 26157 Train: [22] [2100/6250] eta: 0:18:55 lr: 0.000115 grad: 0.0804 (0.0814) loss: 0.8172 (0.8194) time: 0.2683 data: 0.0001 max mem: 26157 Train: [22] [2200/6250] eta: 0:18:27 lr: 0.000115 grad: 0.0829 (0.0815) loss: 0.8170 (0.8193) time: 0.2702 data: 0.0002 max mem: 26157 Train: [22] [2300/6250] eta: 0:17:59 lr: 0.000115 grad: 0.0795 (0.0814) loss: 0.8185 (0.8193) time: 0.2705 data: 0.0002 max mem: 26157 Train: [22] [2400/6250] eta: 0:17:31 lr: 0.000115 grad: 0.0770 (0.0814) loss: 0.8201 (0.8193) time: 0.2689 data: 0.0002 max mem: 26157 Train: [22] [2500/6250] eta: 0:17:03 lr: 0.000115 grad: 0.0777 (0.0813) loss: 0.8170 (0.8193) time: 0.2681 data: 0.0002 max mem: 26157 Train: [22] [2600/6250] eta: 0:16:35 lr: 0.000115 grad: 0.0810 (0.0814) loss: 0.8221 (0.8192) time: 0.2700 data: 0.0002 max mem: 26157 Train: [22] [2700/6250] eta: 0:16:07 lr: 0.000115 grad: 0.0827 (0.0815) loss: 0.8131 (0.8191) time: 0.2678 data: 0.0002 max mem: 26157 Train: [22] [2800/6250] eta: 0:15:40 lr: 0.000115 grad: 0.0815 (0.0818) loss: 0.8151 (0.8191) time: 0.2696 data: 0.0002 max mem: 26157 Train: [22] [2900/6250] eta: 0:15:12 lr: 0.000115 grad: 0.0822 (0.0819) loss: 0.8127 (0.8190) time: 0.2711 data: 0.0002 max mem: 26157 Train: [22] [3000/6250] eta: 0:14:45 lr: 0.000115 grad: 0.0776 (0.0819) loss: 0.8224 (0.8190) time: 0.2690 data: 0.0002 max mem: 26157 Train: [22] [3100/6250] eta: 0:14:23 lr: 0.000115 grad: 0.0809 (0.0820) loss: 0.8133 (0.8190) time: 0.5392 data: 0.2647 max mem: 26157 Train: [22] [3200/6250] eta: 0:13:55 lr: 0.000115 grad: 0.0798 (0.0820) loss: 0.8182 (0.8190) time: 0.2701 data: 0.0002 max mem: 26157 Train: [22] [3300/6250] eta: 0:13:27 lr: 0.000115 grad: 0.0767 (0.0820) loss: 0.8239 (0.8190) time: 0.2703 data: 0.0002 max mem: 26157 Train: [22] [3400/6250] eta: 0:12:59 lr: 0.000115 grad: 0.0839 (0.0820) loss: 0.8211 (0.8190) time: 0.2693 data: 0.0002 max mem: 26157 Train: [22] [3500/6250] eta: 0:12:32 lr: 0.000115 grad: 0.0763 (0.0820) loss: 0.8247 (0.8191) time: 0.2694 data: 0.0001 max mem: 26157 Train: [22] [3600/6250] eta: 0:12:05 lr: 0.000115 grad: 0.0815 (0.0820) loss: 0.8266 (0.8191) time: 0.2686 data: 0.0002 max mem: 26157 Train: [22] [3700/6250] eta: 0:11:37 lr: 0.000115 grad: 0.0769 (0.0820) loss: 0.8255 (0.8192) time: 0.2757 data: 0.0003 max mem: 26157 Train: [22] [3800/6250] eta: 0:11:10 lr: 0.000115 grad: 0.0782 (0.0820) loss: 0.8175 (0.8192) time: 0.2760 data: 0.0002 max mem: 26157 Train: [22] [3900/6250] eta: 0:10:43 lr: 0.000115 grad: 0.0811 (0.0820) loss: 0.8168 (0.8193) time: 0.2712 data: 0.0002 max mem: 26157 Train: [22] [4000/6250] eta: 0:10:15 lr: 0.000115 grad: 0.0791 (0.0819) loss: 0.8253 (0.8193) time: 0.2706 data: 0.0002 max mem: 26157 Train: [22] [4100/6250] eta: 0:09:47 lr: 0.000115 grad: 0.0818 (0.0819) loss: 0.8238 (0.8194) time: 0.2698 data: 0.0002 max mem: 26157 Train: [22] [4200/6250] eta: 0:09:20 lr: 0.000115 grad: 0.0774 (0.0819) loss: 0.8286 (0.8195) time: 0.2704 data: 0.0002 max mem: 26157 Train: [22] [4300/6250] eta: 0:08:53 lr: 0.000115 grad: 0.0826 (0.0818) loss: 0.8180 (0.8195) time: 0.2704 data: 0.0002 max mem: 26157 Train: [22] [4400/6250] eta: 0:08:25 lr: 0.000115 grad: 0.0730 (0.0818) loss: 0.8219 (0.8196) time: 0.2717 data: 0.0002 max mem: 26157 Train: [22] [4500/6250] eta: 0:07:58 lr: 0.000115 grad: 0.0767 (0.0818) loss: 0.8211 (0.8196) time: 0.2707 data: 0.0003 max mem: 26157 Train: [22] [4600/6250] eta: 0:07:30 lr: 0.000115 grad: 0.0766 (0.0817) loss: 0.8262 (0.8197) time: 0.2684 data: 0.0002 max mem: 26157 Train: [22] [4700/6250] eta: 0:07:03 lr: 0.000115 grad: 0.0818 (0.0817) loss: 0.8233 (0.8198) time: 0.2687 data: 0.0002 max mem: 26157 Train: [22] [4800/6250] eta: 0:06:35 lr: 0.000115 grad: 0.0780 (0.0817) loss: 0.8178 (0.8198) time: 0.2697 data: 0.0002 max mem: 26157 Train: [22] [4900/6250] eta: 0:06:08 lr: 0.000114 grad: 0.0791 (0.0817) loss: 0.8205 (0.8199) time: 0.2712 data: 0.0002 max mem: 26157 Train: [22] [5000/6250] eta: 0:05:41 lr: 0.000114 grad: 0.0833 (0.0817) loss: 0.8250 (0.8199) time: 0.2707 data: 0.0002 max mem: 26157 Train: [22] [5100/6250] eta: 0:05:13 lr: 0.000114 grad: 0.0746 (0.0817) loss: 0.8237 (0.8199) time: 0.2685 data: 0.0002 max mem: 26157 Train: [22] [5200/6250] eta: 0:04:46 lr: 0.000114 grad: 0.0777 (0.0817) loss: 0.8236 (0.8200) time: 0.2689 data: 0.0002 max mem: 26157 Train: [22] [5300/6250] eta: 0:04:19 lr: 0.000114 grad: 0.0789 (0.0816) loss: 0.8230 (0.8200) time: 0.2693 data: 0.0002 max mem: 26157 Train: [22] [5400/6250] eta: 0:03:51 lr: 0.000114 grad: 0.0780 (0.0816) loss: 0.8242 (0.8200) time: 0.2689 data: 0.0002 max mem: 26157 Train: [22] [5500/6250] eta: 0:03:24 lr: 0.000114 grad: 0.0780 (0.0815) loss: 0.8252 (0.8201) time: 0.2707 data: 0.0002 max mem: 26157 Train: [22] [5600/6250] eta: 0:02:57 lr: 0.000114 grad: 0.0803 (0.0815) loss: 0.8283 (0.8201) time: 0.2699 data: 0.0002 max mem: 26157 Train: [22] [5700/6250] eta: 0:02:29 lr: 0.000114 grad: 0.0774 (0.0815) loss: 0.8240 (0.8202) time: 0.2684 data: 0.0002 max mem: 26157 Train: [22] [5800/6250] eta: 0:02:02 lr: 0.000114 grad: 0.0781 (0.0815) loss: 0.8258 (0.8202) time: 0.2706 data: 0.0002 max mem: 26157 Train: [22] [5900/6250] eta: 0:01:35 lr: 0.000114 grad: 0.0841 (0.0815) loss: 0.8198 (0.8202) time: 0.2690 data: 0.0002 max mem: 26157 Train: [22] [6000/6250] eta: 0:01:08 lr: 0.000114 grad: 0.0785 (0.0815) loss: 0.8289 (0.8203) time: 0.2694 data: 0.0002 max mem: 26157 Train: [22] [6100/6250] eta: 0:00:40 lr: 0.000114 grad: 0.0817 (0.0815) loss: 0.8231 (0.8203) time: 0.2685 data: 0.0002 max mem: 26157 Train: [22] [6200/6250] eta: 0:00:13 lr: 0.000114 grad: 0.0799 (0.0816) loss: 0.8287 (0.8203) time: 0.2695 data: 0.0002 max mem: 26157 Train: [22] [6249/6250] eta: 0:00:00 lr: 0.000114 grad: 0.0784 (0.0816) loss: 0.8207 (0.8204) time: 0.2713 data: 0.0002 max mem: 26157 Train: [22] Total time: 0:28:27 (0.2732 s / it) Averaged stats: lr: 0.000114 grad: 0.0784 (0.0816) loss: 0.8207 (0.8204) Eval (hcp-train-subset): [22] [ 0/62] eta: 0:03:16 loss: 0.8445 (0.8445) time: 3.1659 data: 3.0461 max mem: 26157 Eval (hcp-train-subset): [22] [61/62] eta: 0:00:00 loss: 0.8338 (0.8350) time: 0.1231 data: 0.0400 max mem: 26157 Eval (hcp-train-subset): [22] Total time: 0:00:12 (0.2012 s / it) Averaged stats (hcp-train-subset): loss: 0.8338 (0.8350) Making plots (hcp-train-subset): example=9 Eval (hcp-val): [22] [ 0/62] eta: 0:05:13 loss: 0.8274 (0.8274) time: 5.0510 data: 4.9670 max mem: 26157 Eval (hcp-val): [22] [61/62] eta: 0:00:00 loss: 0.8325 (0.8323) time: 0.1348 data: 0.0497 max mem: 26157 Eval (hcp-val): [22] Total time: 0:00:12 (0.2034 s / it) Averaged stats (hcp-val): loss: 0.8325 (0.8323) Making plots (hcp-val): example=54 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [23] [ 0/6250] eta: 7:50:31 lr: 0.000114 grad: 0.1248 (0.1248) loss: 0.8269 (0.8269) time: 4.5170 data: 4.1735 max mem: 26157 Train: [23] [ 100/6250] eta: 0:33:47 lr: 0.000114 grad: 0.0710 (0.0924) loss: 0.8299 (0.8291) time: 0.2707 data: 0.0002 max mem: 26157 Train: [23] [ 200/6250] eta: 0:30:19 lr: 0.000114 grad: 0.0755 (0.0860) loss: 0.8205 (0.8276) time: 0.2725 data: 0.0002 max mem: 26157 Train: [23] [ 300/6250] eta: 0:29:38 lr: 0.000114 grad: 0.0741 (0.0832) loss: 0.8220 (0.8281) time: 0.3734 data: 0.1018 max mem: 26157 Train: [23] [ 400/6250] eta: 0:28:29 lr: 0.000114 grad: 0.0683 (0.0814) loss: 0.8313 (0.8283) time: 0.2749 data: 0.0004 max mem: 26157 Train: [23] [ 500/6250] eta: 0:27:45 lr: 0.000114 grad: 0.0743 (0.0813) loss: 0.8273 (0.8280) time: 0.3132 data: 0.0351 max mem: 26157 Train: [23] [ 600/6250] eta: 0:27:00 lr: 0.000114 grad: 0.0744 (0.0810) loss: 0.8233 (0.8275) time: 0.2717 data: 0.0002 max mem: 26157 Train: [23] [ 700/6250] eta: 0:26:20 lr: 0.000114 grad: 0.0787 (0.0806) loss: 0.8268 (0.8273) time: 0.2728 data: 0.0002 max mem: 26157 Train: [23] [ 800/6250] eta: 0:25:43 lr: 0.000114 grad: 0.0789 (0.0803) loss: 0.8254 (0.8274) time: 0.2740 data: 0.0002 max mem: 26157 Train: [23] [ 900/6250] eta: 0:25:09 lr: 0.000114 grad: 0.0750 (0.0799) loss: 0.8223 (0.8273) time: 0.2713 data: 0.0002 max mem: 26157 Train: [23] [1000/6250] eta: 0:24:58 lr: 0.000114 grad: 0.0728 (0.0795) loss: 0.8203 (0.8270) time: 0.4858 data: 0.2161 max mem: 26157 Train: [23] [1100/6250] eta: 0:24:22 lr: 0.000114 grad: 0.0712 (0.0791) loss: 0.8323 (0.8269) time: 0.2686 data: 0.0002 max mem: 26157 Train: [23] [1200/6250] eta: 0:23:48 lr: 0.000114 grad: 0.0782 (0.0793) loss: 0.8248 (0.8265) time: 0.2684 data: 0.0002 max mem: 26157 Train: [23] [1300/6250] eta: 0:23:15 lr: 0.000114 grad: 0.0740 (0.0792) loss: 0.8236 (0.8263) time: 0.2694 data: 0.0002 max mem: 26157 Train: [23] [1400/6250] eta: 0:22:42 lr: 0.000114 grad: 0.0748 (0.0792) loss: 0.8225 (0.8260) time: 0.2684 data: 0.0002 max mem: 26157 Train: [23] [1500/6250] eta: 0:22:11 lr: 0.000114 grad: 0.0789 (0.0795) loss: 0.8172 (0.8258) time: 0.2695 data: 0.0002 max mem: 26157 Train: [23] [1600/6250] eta: 0:21:40 lr: 0.000114 grad: 0.0849 (0.0797) loss: 0.8217 (0.8255) time: 0.2707 data: 0.0002 max mem: 26157 Train: [23] [1700/6250] eta: 0:21:30 lr: 0.000114 grad: 0.0748 (0.0797) loss: 0.8229 (0.8252) time: 0.6389 data: 0.3603 max mem: 26157 Train: [23] [1800/6250] eta: 0:20:58 lr: 0.000114 grad: 0.0805 (0.0798) loss: 0.8235 (0.8248) time: 0.2707 data: 0.0002 max mem: 26157 Train: [23] [1900/6250] eta: 0:20:27 lr: 0.000114 grad: 0.0769 (0.0798) loss: 0.8199 (0.8245) time: 0.2709 data: 0.0002 max mem: 26157 Train: [23] [2000/6250] eta: 0:20:03 lr: 0.000114 grad: 0.0848 (0.0799) loss: 0.8170 (0.8243) time: 0.2704 data: 0.0002 max mem: 26157 Train: [23] [2100/6250] eta: 0:19:32 lr: 0.000114 grad: 0.0760 (0.0800) loss: 0.8198 (0.8241) time: 0.2691 data: 0.0002 max mem: 26157 Train: [23] [2200/6250] eta: 0:19:01 lr: 0.000114 grad: 0.0798 (0.0800) loss: 0.8207 (0.8240) time: 0.2680 data: 0.0002 max mem: 26157 Train: [23] [2300/6250] eta: 0:18:31 lr: 0.000114 grad: 0.0770 (0.0801) loss: 0.8214 (0.8239) time: 0.2687 data: 0.0002 max mem: 26157 Train: [23] [2400/6250] eta: 0:18:01 lr: 0.000114 grad: 0.0750 (0.0800) loss: 0.8232 (0.8238) time: 0.2698 data: 0.0002 max mem: 26157 Train: [23] [2500/6250] eta: 0:17:32 lr: 0.000114 grad: 0.0771 (0.0800) loss: 0.8220 (0.8238) time: 0.2694 data: 0.0002 max mem: 26157 Train: [23] [2600/6250] eta: 0:17:02 lr: 0.000114 grad: 0.0805 (0.0800) loss: 0.8247 (0.8237) time: 0.2714 data: 0.0002 max mem: 26157 Train: [23] [2700/6250] eta: 0:16:33 lr: 0.000114 grad: 0.0769 (0.0800) loss: 0.8224 (0.8238) time: 0.2706 data: 0.0002 max mem: 26157 Train: [23] [2800/6250] eta: 0:16:04 lr: 0.000114 grad: 0.0770 (0.0800) loss: 0.8250 (0.8238) time: 0.2737 data: 0.0003 max mem: 26157 Train: [23] [2900/6250] eta: 0:15:35 lr: 0.000114 grad: 0.0797 (0.0800) loss: 0.8236 (0.8238) time: 0.2740 data: 0.0002 max mem: 26157 Train: [23] [3000/6250] eta: 0:15:20 lr: 0.000114 grad: 0.0829 (0.0799) loss: 0.8209 (0.8238) time: 0.2785 data: 0.0002 max mem: 26157 Train: [23] [3100/6250] eta: 0:14:51 lr: 0.000114 grad: 0.0805 (0.0799) loss: 0.8230 (0.8238) time: 0.2710 data: 0.0002 max mem: 26157 Train: [23] [3200/6250] eta: 0:14:21 lr: 0.000114 grad: 0.0750 (0.0799) loss: 0.8249 (0.8237) time: 0.2687 data: 0.0002 max mem: 26157 Train: [23] [3300/6250] eta: 0:13:52 lr: 0.000114 grad: 0.0849 (0.0799) loss: 0.8192 (0.8236) time: 0.2695 data: 0.0002 max mem: 26157 Train: [23] [3400/6250] eta: 0:13:23 lr: 0.000114 grad: 0.0775 (0.0799) loss: 0.8220 (0.8235) time: 0.2685 data: 0.0002 max mem: 26157 Train: [23] [3500/6250] eta: 0:12:53 lr: 0.000114 grad: 0.0803 (0.0799) loss: 0.8145 (0.8234) time: 0.2704 data: 0.0002 max mem: 26157 Train: [23] [3600/6250] eta: 0:12:24 lr: 0.000114 grad: 0.0833 (0.0801) loss: 0.8227 (0.8234) time: 0.2679 data: 0.0002 max mem: 26157 Train: [23] [3700/6250] eta: 0:11:55 lr: 0.000114 grad: 0.0812 (0.0801) loss: 0.8218 (0.8233) time: 0.2690 data: 0.0002 max mem: 26157 Train: [23] [3800/6250] eta: 0:11:27 lr: 0.000114 grad: 0.0715 (0.0802) loss: 0.8246 (0.8233) time: 0.2697 data: 0.0002 max mem: 26157 Train: [23] [3900/6250] eta: 0:10:58 lr: 0.000114 grad: 0.0784 (0.0802) loss: 0.8216 (0.8233) time: 0.2691 data: 0.0002 max mem: 26157 Train: [23] [4000/6250] eta: 0:10:29 lr: 0.000113 grad: 0.0776 (0.0802) loss: 0.8213 (0.8232) time: 0.2704 data: 0.0002 max mem: 26157 Train: [23] [4100/6250] eta: 0:10:01 lr: 0.000113 grad: 0.0780 (0.0802) loss: 0.8236 (0.8232) time: 0.2686 data: 0.0002 max mem: 26157 Train: [23] [4200/6250] eta: 0:09:32 lr: 0.000113 grad: 0.0793 (0.0802) loss: 0.8170 (0.8231) time: 0.2690 data: 0.0002 max mem: 26157 Train: [23] [4300/6250] eta: 0:09:04 lr: 0.000113 grad: 0.0754 (0.0802) loss: 0.8207 (0.8231) time: 0.2685 data: 0.0002 max mem: 26157 Train: [23] [4400/6250] eta: 0:08:36 lr: 0.000113 grad: 0.0794 (0.0802) loss: 0.8128 (0.8230) time: 0.2708 data: 0.0002 max mem: 26157 Train: [23] [4500/6250] eta: 0:08:07 lr: 0.000113 grad: 0.0791 (0.0802) loss: 0.8210 (0.8229) time: 0.2682 data: 0.0002 max mem: 26157 Train: [23] [4600/6250] eta: 0:07:39 lr: 0.000113 grad: 0.0775 (0.0802) loss: 0.8236 (0.8229) time: 0.2707 data: 0.0002 max mem: 26157 Train: [23] [4700/6250] eta: 0:07:11 lr: 0.000113 grad: 0.0796 (0.0803) loss: 0.8196 (0.8229) time: 0.2690 data: 0.0002 max mem: 26157 Train: [23] [4800/6250] eta: 0:06:43 lr: 0.000113 grad: 0.0769 (0.0803) loss: 0.8220 (0.8228) time: 0.2695 data: 0.0002 max mem: 26157 Train: [23] [4900/6250] eta: 0:06:15 lr: 0.000113 grad: 0.0805 (0.0803) loss: 0.8162 (0.8228) time: 0.2729 data: 0.0003 max mem: 26157 Train: [23] [5000/6250] eta: 0:05:47 lr: 0.000113 grad: 0.0758 (0.0802) loss: 0.8256 (0.8229) time: 0.2756 data: 0.0002 max mem: 26157 Train: [23] [5100/6250] eta: 0:05:19 lr: 0.000113 grad: 0.0785 (0.0803) loss: 0.8247 (0.8228) time: 0.2700 data: 0.0002 max mem: 26157 Train: [23] [5200/6250] eta: 0:04:51 lr: 0.000113 grad: 0.0753 (0.0803) loss: 0.8215 (0.8228) time: 0.2690 data: 0.0002 max mem: 26157 Train: [23] [5300/6250] eta: 0:04:23 lr: 0.000113 grad: 0.0773 (0.0803) loss: 0.8237 (0.8227) time: 0.2722 data: 0.0002 max mem: 26157 Train: [23] [5400/6250] eta: 0:03:56 lr: 0.000113 grad: 0.0804 (0.0803) loss: 0.8232 (0.8227) time: 0.2734 data: 0.0002 max mem: 26157 Train: [23] [5500/6250] eta: 0:03:28 lr: 0.000113 grad: 0.0837 (0.0803) loss: 0.8165 (0.8227) time: 0.2691 data: 0.0002 max mem: 26157 Train: [23] [5600/6250] eta: 0:03:00 lr: 0.000113 grad: 0.0799 (0.0804) loss: 0.8219 (0.8227) time: 0.2686 data: 0.0002 max mem: 26157 Train: [23] [5700/6250] eta: 0:02:32 lr: 0.000113 grad: 0.0778 (0.0804) loss: 0.8246 (0.8227) time: 0.2692 data: 0.0002 max mem: 26157 Train: [23] [5800/6250] eta: 0:02:04 lr: 0.000113 grad: 0.0791 (0.0804) loss: 0.8172 (0.8227) time: 0.2696 data: 0.0002 max mem: 26157 Train: [23] [5900/6250] eta: 0:01:37 lr: 0.000113 grad: 0.0819 (0.0804) loss: 0.8211 (0.8226) time: 0.2689 data: 0.0002 max mem: 26157 Train: [23] [6000/6250] eta: 0:01:09 lr: 0.000113 grad: 0.0758 (0.0804) loss: 0.8218 (0.8226) time: 0.2693 data: 0.0002 max mem: 26157 Train: [23] [6100/6250] eta: 0:00:41 lr: 0.000113 grad: 0.0769 (0.0804) loss: 0.8209 (0.8225) time: 0.2682 data: 0.0002 max mem: 26157 Train: [23] [6200/6250] eta: 0:00:13 lr: 0.000113 grad: 0.0794 (0.0805) loss: 0.8235 (0.8225) time: 0.2699 data: 0.0002 max mem: 26157 Train: [23] [6249/6250] eta: 0:00:00 lr: 0.000113 grad: 0.0728 (0.0805) loss: 0.8197 (0.8226) time: 0.2687 data: 0.0002 max mem: 26157 Train: [23] Total time: 0:28:56 (0.2779 s / it) Averaged stats: lr: 0.000113 grad: 0.0728 (0.0805) loss: 0.8197 (0.8226) Eval (hcp-train-subset): [23] [ 0/62] eta: 0:03:50 loss: 0.8437 (0.8437) time: 3.7174 data: 3.6079 max mem: 26157 Eval (hcp-train-subset): [23] [61/62] eta: 0:00:00 loss: 0.8335 (0.8355) time: 0.1103 data: 0.0259 max mem: 26157 Eval (hcp-train-subset): [23] Total time: 0:00:12 (0.2052 s / it) Averaged stats (hcp-train-subset): loss: 0.8335 (0.8355) Making plots (hcp-train-subset): example=44 Eval (hcp-val): [23] [ 0/62] eta: 0:03:48 loss: 0.8257 (0.8257) time: 3.6868 data: 3.5643 max mem: 26157 Eval (hcp-val): [23] [61/62] eta: 0:00:00 loss: 0.8310 (0.8321) time: 0.1289 data: 0.0455 max mem: 26157 Eval (hcp-val): [23] Total time: 0:00:13 (0.2216 s / it) Averaged stats (hcp-val): loss: 0.8310 (0.8321) Making plots (hcp-val): example=23 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [24] [ 0/6250] eta: 8:35:38 lr: 0.000113 grad: 0.0654 (0.0654) loss: 0.8552 (0.8552) time: 4.9502 data: 4.5488 max mem: 26157 Train: [24] [ 100/6250] eta: 0:33:10 lr: 0.000113 grad: 0.0742 (0.0898) loss: 0.8305 (0.8348) time: 0.2716 data: 0.0002 max mem: 26157 Train: [24] [ 200/6250] eta: 0:29:56 lr: 0.000113 grad: 0.0733 (0.0846) loss: 0.8323 (0.8302) time: 0.2686 data: 0.0002 max mem: 26157 Train: [24] [ 300/6250] eta: 0:28:31 lr: 0.000113 grad: 0.0758 (0.0832) loss: 0.8229 (0.8272) time: 0.2693 data: 0.0002 max mem: 26157 Train: [24] [ 400/6250] eta: 0:27:37 lr: 0.000113 grad: 0.0778 (0.0817) loss: 0.8140 (0.8252) time: 0.2751 data: 0.0002 max mem: 26157 Train: [24] [ 500/6250] eta: 0:26:53 lr: 0.000113 grad: 0.0736 (0.0807) loss: 0.8221 (0.8244) time: 0.2694 data: 0.0002 max mem: 26157 Train: [24] [ 600/6250] eta: 0:26:15 lr: 0.000113 grad: 0.0710 (0.0801) loss: 0.8251 (0.8240) time: 0.2688 data: 0.0002 max mem: 26157 Train: [24] [ 700/6250] eta: 0:25:41 lr: 0.000113 grad: 0.0738 (0.0801) loss: 0.8184 (0.8236) time: 0.2696 data: 0.0002 max mem: 26157 Train: [24] [ 800/6250] eta: 0:25:09 lr: 0.000113 grad: 0.0832 (0.0803) loss: 0.8191 (0.8233) time: 0.2713 data: 0.0002 max mem: 26157 Train: [24] [ 900/6250] eta: 0:24:38 lr: 0.000113 grad: 0.0807 (0.0801) loss: 0.8212 (0.8232) time: 0.2691 data: 0.0002 max mem: 26157 Train: [24] [1000/6250] eta: 0:24:07 lr: 0.000113 grad: 0.0757 (0.0798) loss: 0.8283 (0.8233) time: 0.2679 data: 0.0002 max mem: 26157 Train: [24] [1100/6250] eta: 0:23:37 lr: 0.000113 grad: 0.0736 (0.0797) loss: 0.8253 (0.8234) time: 0.2679 data: 0.0002 max mem: 26157 Train: [24] [1200/6250] eta: 0:23:06 lr: 0.000113 grad: 0.0763 (0.0796) loss: 0.8192 (0.8234) time: 0.2693 data: 0.0002 max mem: 26157 Train: [24] [1300/6250] eta: 0:22:37 lr: 0.000113 grad: 0.0770 (0.0795) loss: 0.8255 (0.8234) time: 0.2688 data: 0.0002 max mem: 26157 Train: [24] [1400/6250] eta: 0:22:08 lr: 0.000113 grad: 0.0787 (0.0793) loss: 0.8285 (0.8235) time: 0.2681 data: 0.0002 max mem: 26157 Train: [24] [1500/6250] eta: 0:21:39 lr: 0.000113 grad: 0.0744 (0.0793) loss: 0.8219 (0.8234) time: 0.2696 data: 0.0002 max mem: 26157 Train: [24] [1600/6250] eta: 0:21:13 lr: 0.000113 grad: 0.0802 (0.0795) loss: 0.8226 (0.8233) time: 0.3084 data: 0.0384 max mem: 26157 Train: [24] [1700/6250] eta: 0:20:44 lr: 0.000113 grad: 0.0776 (0.0799) loss: 0.8225 (0.8232) time: 0.2701 data: 0.0002 max mem: 26157 Train: [24] [1800/6250] eta: 0:20:33 lr: 0.000113 grad: 0.0772 (0.0799) loss: 0.8210 (0.8229) time: 0.6115 data: 0.3320 max mem: 26157 Train: [24] [1900/6250] eta: 0:20:04 lr: 0.000113 grad: 0.0791 (0.0799) loss: 0.8144 (0.8228) time: 0.2710 data: 0.0002 max mem: 26157 Train: [24] [2000/6250] eta: 0:19:35 lr: 0.000113 grad: 0.0746 (0.0799) loss: 0.8229 (0.8226) time: 0.2722 data: 0.0002 max mem: 26157 Train: [24] [2100/6250] eta: 0:19:11 lr: 0.000113 grad: 0.0749 (0.0799) loss: 0.8219 (0.8225) time: 0.2689 data: 0.0002 max mem: 26157 Train: [24] [2200/6250] eta: 0:19:00 lr: 0.000113 grad: 0.0782 (0.0799) loss: 0.8245 (0.8225) time: 0.7434 data: 0.4708 max mem: 26157 Train: [24] [2300/6250] eta: 0:18:30 lr: 0.000113 grad: 0.0775 (0.0800) loss: 0.8236 (0.8225) time: 0.2739 data: 0.0002 max mem: 26157 Train: [24] [2400/6250] eta: 0:18:01 lr: 0.000113 grad: 0.0758 (0.0800) loss: 0.8203 (0.8225) time: 0.2723 data: 0.0002 max mem: 26157 Train: [24] [2500/6250] eta: 0:17:31 lr: 0.000113 grad: 0.0769 (0.0801) loss: 0.8172 (0.8225) time: 0.2698 data: 0.0002 max mem: 26157 Train: [24] [2600/6250] eta: 0:17:02 lr: 0.000113 grad: 0.0791 (0.0802) loss: 0.8182 (0.8224) time: 0.2702 data: 0.0002 max mem: 26157 Train: [24] [2700/6250] eta: 0:16:32 lr: 0.000113 grad: 0.0784 (0.0802) loss: 0.8225 (0.8224) time: 0.2690 data: 0.0002 max mem: 26157 Train: [24] [2800/6250] eta: 0:16:05 lr: 0.000113 grad: 0.0815 (0.0804) loss: 0.8211 (0.8224) time: 0.2701 data: 0.0002 max mem: 26157 Train: [24] [2900/6250] eta: 0:15:36 lr: 0.000112 grad: 0.0804 (0.0805) loss: 0.8204 (0.8224) time: 0.2701 data: 0.0002 max mem: 26157 Train: [24] [3000/6250] eta: 0:15:07 lr: 0.000112 grad: 0.0812 (0.0807) loss: 0.8254 (0.8224) time: 0.2696 data: 0.0002 max mem: 26157 Train: [24] [3100/6250] eta: 0:14:38 lr: 0.000112 grad: 0.0823 (0.0811) loss: 0.8217 (0.8223) time: 0.2726 data: 0.0003 max mem: 26157 Train: [24] [3200/6250] eta: 0:14:09 lr: 0.000112 grad: 0.0809 (0.0812) loss: 0.8209 (0.8222) time: 0.2710 data: 0.0002 max mem: 26157 Train: [24] [3300/6250] eta: 0:13:41 lr: 0.000112 grad: 0.0823 (0.0813) loss: 0.8195 (0.8222) time: 0.2692 data: 0.0002 max mem: 26157 Train: [24] [3400/6250] eta: 0:13:12 lr: 0.000112 grad: 0.0840 (0.0813) loss: 0.8223 (0.8222) time: 0.2696 data: 0.0002 max mem: 26157 Train: [24] [3500/6250] eta: 0:12:44 lr: 0.000112 grad: 0.0815 (0.0814) loss: 0.8188 (0.8222) time: 0.2685 data: 0.0002 max mem: 26157 Train: [24] [3600/6250] eta: 0:12:15 lr: 0.000112 grad: 0.0803 (0.0814) loss: 0.8152 (0.8221) time: 0.2732 data: 0.0002 max mem: 26157 Train: [24] [3700/6250] eta: 0:11:47 lr: 0.000112 grad: 0.0805 (0.0814) loss: 0.8201 (0.8221) time: 0.2713 data: 0.0002 max mem: 26157 Train: [24] [3800/6250] eta: 0:11:19 lr: 0.000112 grad: 0.0812 (0.0815) loss: 0.8199 (0.8220) time: 0.2678 data: 0.0002 max mem: 26157 Train: [24] [3900/6250] eta: 0:10:51 lr: 0.000112 grad: 0.0789 (0.0815) loss: 0.8169 (0.8219) time: 0.2714 data: 0.0002 max mem: 26157 Train: [24] [4000/6250] eta: 0:10:22 lr: 0.000112 grad: 0.0777 (0.0815) loss: 0.8217 (0.8218) time: 0.2711 data: 0.0002 max mem: 26157 Train: [24] [4100/6250] eta: 0:09:59 lr: 0.000112 grad: 0.0787 (0.0815) loss: 0.8233 (0.8218) time: 0.2697 data: 0.0002 max mem: 26157 Train: [24] [4200/6250] eta: 0:09:34 lr: 0.000112 grad: 0.0850 (0.0815) loss: 0.8192 (0.8219) time: 0.2701 data: 0.0002 max mem: 26157 Train: [24] [4300/6250] eta: 0:09:05 lr: 0.000112 grad: 0.0788 (0.0815) loss: 0.8195 (0.8219) time: 0.2683 data: 0.0002 max mem: 26157 Train: [24] [4400/6250] eta: 0:08:39 lr: 0.000112 grad: 0.0783 (0.0816) loss: 0.8177 (0.8218) time: 0.5457 data: 0.2696 max mem: 26157 Train: [24] [4500/6250] eta: 0:08:11 lr: 0.000112 grad: 0.0799 (0.0816) loss: 0.8170 (0.8218) time: 0.2794 data: 0.0002 max mem: 26157 Train: [24] [4600/6250] eta: 0:07:46 lr: 0.000112 grad: 0.0814 (0.0816) loss: 0.8157 (0.8217) time: 0.2687 data: 0.0002 max mem: 26157 Train: [24] [4700/6250] eta: 0:07:20 lr: 0.000112 grad: 0.0778 (0.0816) loss: 0.8208 (0.8217) time: 0.5257 data: 0.2530 max mem: 26157 Train: [24] [4800/6250] eta: 0:06:51 lr: 0.000112 grad: 0.0799 (0.0817) loss: 0.8222 (0.8217) time: 0.2674 data: 0.0001 max mem: 26157 Train: [24] [4900/6250] eta: 0:06:22 lr: 0.000112 grad: 0.0761 (0.0817) loss: 0.8191 (0.8217) time: 0.2697 data: 0.0002 max mem: 26157 Train: [24] [5000/6250] eta: 0:05:53 lr: 0.000112 grad: 0.0762 (0.0816) loss: 0.8253 (0.8216) time: 0.2693 data: 0.0002 max mem: 26157 Train: [24] [5100/6250] eta: 0:05:25 lr: 0.000112 grad: 0.0830 (0.0817) loss: 0.8186 (0.8216) time: 0.2689 data: 0.0002 max mem: 26157 Train: [24] [5200/6250] eta: 0:04:56 lr: 0.000112 grad: 0.0833 (0.0817) loss: 0.8218 (0.8216) time: 0.2678 data: 0.0002 max mem: 26157 Train: [24] [5300/6250] eta: 0:04:28 lr: 0.000112 grad: 0.0811 (0.0817) loss: 0.8139 (0.8216) time: 0.2689 data: 0.0002 max mem: 26157 Train: [24] [5400/6250] eta: 0:03:59 lr: 0.000112 grad: 0.0810 (0.0817) loss: 0.8223 (0.8216) time: 0.2690 data: 0.0002 max mem: 26157 Train: [24] [5500/6250] eta: 0:03:31 lr: 0.000112 grad: 0.0776 (0.0818) loss: 0.8211 (0.8216) time: 0.2728 data: 0.0002 max mem: 26157 Train: [24] [5600/6250] eta: 0:03:03 lr: 0.000112 grad: 0.0849 (0.0818) loss: 0.8216 (0.8215) time: 0.2699 data: 0.0002 max mem: 26157 Train: [24] [5700/6250] eta: 0:02:34 lr: 0.000112 grad: 0.0804 (0.0818) loss: 0.8186 (0.8215) time: 0.2690 data: 0.0002 max mem: 26157 Train: [24] [5800/6250] eta: 0:02:06 lr: 0.000112 grad: 0.0776 (0.0819) loss: 0.8224 (0.8214) time: 0.2706 data: 0.0002 max mem: 26157 Train: [24] [5900/6250] eta: 0:01:38 lr: 0.000112 grad: 0.0830 (0.0820) loss: 0.8170 (0.8213) time: 0.2698 data: 0.0002 max mem: 26157 Train: [24] [6000/6250] eta: 0:01:10 lr: 0.000112 grad: 0.0804 (0.0820) loss: 0.8157 (0.8213) time: 0.3295 data: 0.0582 max mem: 26157 Train: [24] [6100/6250] eta: 0:00:42 lr: 0.000112 grad: 0.0858 (0.0821) loss: 0.8145 (0.8212) time: 0.2704 data: 0.0002 max mem: 26157 Train: [24] [6200/6250] eta: 0:00:14 lr: 0.000112 grad: 0.0800 (0.0821) loss: 0.8178 (0.8212) time: 0.2725 data: 0.0002 max mem: 26157 Train: [24] [6249/6250] eta: 0:00:00 lr: 0.000112 grad: 0.0796 (0.0821) loss: 0.8265 (0.8212) time: 0.2757 data: 0.0002 max mem: 26157 Train: [24] Total time: 0:29:27 (0.2828 s / it) Averaged stats: lr: 0.000112 grad: 0.0796 (0.0821) loss: 0.8265 (0.8212) Eval (hcp-train-subset): [24] [ 0/62] eta: 0:04:43 loss: 0.8434 (0.8434) time: 4.5731 data: 4.4613 max mem: 26157 Eval (hcp-train-subset): [24] [61/62] eta: 0:00:00 loss: 0.8328 (0.8344) time: 0.1488 data: 0.0655 max mem: 26157 Eval (hcp-train-subset): [24] Total time: 0:00:13 (0.2242 s / it) Averaged stats (hcp-train-subset): loss: 0.8328 (0.8344) Making plots (hcp-train-subset): example=38 Eval (hcp-val): [24] [ 0/62] eta: 0:03:48 loss: 0.8293 (0.8293) time: 3.6911 data: 3.5789 max mem: 26157 Eval (hcp-val): [24] [61/62] eta: 0:00:00 loss: 0.8304 (0.8314) time: 0.1499 data: 0.0669 max mem: 26157 Eval (hcp-val): [24] Total time: 0:00:13 (0.2211 s / it) Averaged stats (hcp-val): loss: 0.8304 (0.8314) Making plots (hcp-val): example=30 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [25] [ 0/6250] eta: 11:11:37 lr: 0.000112 grad: 0.0759 (0.0759) loss: 0.8620 (0.8620) time: 6.4476 data: 6.1699 max mem: 26157 Train: [25] [ 100/6250] eta: 0:34:10 lr: 0.000112 grad: 0.0700 (0.0826) loss: 0.8362 (0.8372) time: 0.2694 data: 0.0002 max mem: 26157 Train: [25] [ 200/6250] eta: 0:30:27 lr: 0.000112 grad: 0.0809 (0.0815) loss: 0.8305 (0.8355) time: 0.2701 data: 0.0002 max mem: 26157 Train: [25] [ 300/6250] eta: 0:28:54 lr: 0.000112 grad: 0.0768 (0.0813) loss: 0.8252 (0.8322) time: 0.2703 data: 0.0002 max mem: 26157 Train: [25] [ 400/6250] eta: 0:27:55 lr: 0.000112 grad: 0.0816 (0.0821) loss: 0.8188 (0.8292) time: 0.2711 data: 0.0002 max mem: 26157 Train: [25] [ 500/6250] eta: 0:27:12 lr: 0.000112 grad: 0.0810 (0.0828) loss: 0.8221 (0.8268) time: 0.2766 data: 0.0002 max mem: 26157 Train: [25] [ 600/6250] eta: 0:26:47 lr: 0.000112 grad: 0.0772 (0.0826) loss: 0.8213 (0.8256) time: 0.3537 data: 0.0816 max mem: 26157 Train: [25] [ 700/6250] eta: 0:26:08 lr: 0.000112 grad: 0.0797 (0.0822) loss: 0.8236 (0.8249) time: 0.2728 data: 0.0002 max mem: 26157 Train: [25] [ 800/6250] eta: 0:25:31 lr: 0.000112 grad: 0.0747 (0.0819) loss: 0.8255 (0.8245) time: 0.2683 data: 0.0002 max mem: 26157 Train: [25] [ 900/6250] eta: 0:24:59 lr: 0.000112 grad: 0.0791 (0.0817) loss: 0.8188 (0.8240) time: 0.2690 data: 0.0002 max mem: 26157 Train: [25] [1000/6250] eta: 0:24:26 lr: 0.000112 grad: 0.0812 (0.0818) loss: 0.8218 (0.8236) time: 0.2744 data: 0.0002 max mem: 26157 Train: [25] [1100/6250] eta: 0:23:54 lr: 0.000112 grad: 0.0753 (0.0817) loss: 0.8219 (0.8234) time: 0.2694 data: 0.0002 max mem: 26157 Train: [25] [1200/6250] eta: 0:23:22 lr: 0.000112 grad: 0.0774 (0.0816) loss: 0.8160 (0.8230) time: 0.2675 data: 0.0002 max mem: 26157 Train: [25] [1300/6250] eta: 0:22:51 lr: 0.000112 grad: 0.0797 (0.0819) loss: 0.8161 (0.8225) time: 0.2683 data: 0.0002 max mem: 26157 Train: [25] [1400/6250] eta: 0:22:21 lr: 0.000112 grad: 0.0757 (0.0822) loss: 0.8166 (0.8220) time: 0.2682 data: 0.0002 max mem: 26157 Train: [25] [1500/6250] eta: 0:21:50 lr: 0.000112 grad: 0.0772 (0.0822) loss: 0.8213 (0.8217) time: 0.2685 data: 0.0002 max mem: 26157 Train: [25] [1600/6250] eta: 0:21:21 lr: 0.000111 grad: 0.0811 (0.0824) loss: 0.8126 (0.8214) time: 0.2691 data: 0.0002 max mem: 26157 Train: [25] [1700/6250] eta: 0:20:52 lr: 0.000111 grad: 0.0847 (0.0826) loss: 0.8186 (0.8210) time: 0.2697 data: 0.0002 max mem: 26157 Train: [25] [1800/6250] eta: 0:20:23 lr: 0.000111 grad: 0.0850 (0.0826) loss: 0.8215 (0.8207) time: 0.2698 data: 0.0002 max mem: 26157 Train: [25] [1900/6250] eta: 0:19:54 lr: 0.000111 grad: 0.0790 (0.0827) loss: 0.8160 (0.8203) time: 0.2679 data: 0.0002 max mem: 26157 Train: [25] [2000/6250] eta: 0:19:25 lr: 0.000111 grad: 0.0815 (0.0827) loss: 0.8110 (0.8199) time: 0.2695 data: 0.0002 max mem: 26157 Train: [25] [2100/6250] eta: 0:18:57 lr: 0.000111 grad: 0.0831 (0.0829) loss: 0.8173 (0.8196) time: 0.2684 data: 0.0002 max mem: 26157 Train: [25] [2200/6250] eta: 0:18:29 lr: 0.000111 grad: 0.0770 (0.0830) loss: 0.8105 (0.8193) time: 0.2700 data: 0.0002 max mem: 26157 Train: [25] [2300/6250] eta: 0:18:01 lr: 0.000111 grad: 0.0800 (0.0832) loss: 0.8162 (0.8192) time: 0.2702 data: 0.0002 max mem: 26157 Train: [25] [2400/6250] eta: 0:17:33 lr: 0.000111 grad: 0.0743 (0.0831) loss: 0.8150 (0.8190) time: 0.2693 data: 0.0002 max mem: 26157 Train: [25] [2500/6250] eta: 0:17:05 lr: 0.000111 grad: 0.0822 (0.0831) loss: 0.8154 (0.8188) time: 0.2688 data: 0.0002 max mem: 26157 Train: [25] [2600/6250] eta: 0:16:37 lr: 0.000111 grad: 0.0806 (0.0831) loss: 0.8122 (0.8188) time: 0.2678 data: 0.0002 max mem: 26157 Train: [25] [2700/6250] eta: 0:16:09 lr: 0.000111 grad: 0.0819 (0.0832) loss: 0.8100 (0.8186) time: 0.2698 data: 0.0002 max mem: 26157 Train: [25] [2800/6250] eta: 0:15:41 lr: 0.000111 grad: 0.0857 (0.0832) loss: 0.8132 (0.8186) time: 0.2685 data: 0.0002 max mem: 26157 Train: [25] [2900/6250] eta: 0:15:13 lr: 0.000111 grad: 0.0834 (0.0833) loss: 0.8145 (0.8185) time: 0.2702 data: 0.0002 max mem: 26157 Train: [25] [3000/6250] eta: 0:14:46 lr: 0.000111 grad: 0.0845 (0.0835) loss: 0.8178 (0.8184) time: 0.2705 data: 0.0002 max mem: 26157 Train: [25] [3100/6250] eta: 0:14:18 lr: 0.000111 grad: 0.0806 (0.0836) loss: 0.8181 (0.8184) time: 0.2700 data: 0.0002 max mem: 26157 Train: [25] [3200/6250] eta: 0:13:51 lr: 0.000111 grad: 0.0845 (0.0837) loss: 0.8146 (0.8183) time: 0.2696 data: 0.0002 max mem: 26157 Train: [25] [3300/6250] eta: 0:13:24 lr: 0.000111 grad: 0.0790 (0.0836) loss: 0.8213 (0.8183) time: 0.2901 data: 0.0003 max mem: 26157 Train: [25] [3400/6250] eta: 0:12:57 lr: 0.000111 grad: 0.0847 (0.0836) loss: 0.8178 (0.8183) time: 0.2682 data: 0.0002 max mem: 26157 Train: [25] [3500/6250] eta: 0:12:29 lr: 0.000111 grad: 0.0800 (0.0837) loss: 0.8193 (0.8183) time: 0.2692 data: 0.0002 max mem: 26157 Train: [25] [3600/6250] eta: 0:12:02 lr: 0.000111 grad: 0.0850 (0.0838) loss: 0.8120 (0.8182) time: 0.2720 data: 0.0002 max mem: 26157 Train: [25] [3700/6250] eta: 0:11:36 lr: 0.000111 grad: 0.0810 (0.0838) loss: 0.8217 (0.8182) time: 0.2694 data: 0.0002 max mem: 26157 Train: [25] [3800/6250] eta: 0:11:08 lr: 0.000111 grad: 0.0797 (0.0838) loss: 0.8206 (0.8181) time: 0.2694 data: 0.0002 max mem: 26157 Train: [25] [3900/6250] eta: 0:10:42 lr: 0.000111 grad: 0.0805 (0.0837) loss: 0.8206 (0.8181) time: 0.2709 data: 0.0002 max mem: 26157 Train: [25] [4000/6250] eta: 0:10:14 lr: 0.000111 grad: 0.0806 (0.0837) loss: 0.8156 (0.8180) time: 0.2678 data: 0.0002 max mem: 26157 Train: [25] [4100/6250] eta: 0:09:47 lr: 0.000111 grad: 0.0838 (0.0838) loss: 0.8150 (0.8180) time: 0.2711 data: 0.0002 max mem: 26157 Train: [25] [4200/6250] eta: 0:09:19 lr: 0.000111 grad: 0.0880 (0.0839) loss: 0.8182 (0.8179) time: 0.2697 data: 0.0002 max mem: 26157 Train: [25] [4300/6250] eta: 0:08:52 lr: 0.000111 grad: 0.0886 (0.0840) loss: 0.8145 (0.8178) time: 0.2699 data: 0.0002 max mem: 26157 Train: [25] [4400/6250] eta: 0:08:25 lr: 0.000111 grad: 0.0816 (0.0840) loss: 0.8143 (0.8177) time: 0.2781 data: 0.0002 max mem: 26157 Train: [25] [4500/6250] eta: 0:07:57 lr: 0.000111 grad: 0.0829 (0.0842) loss: 0.8118 (0.8176) time: 0.2689 data: 0.0002 max mem: 26157 Train: [25] [4600/6250] eta: 0:07:30 lr: 0.000111 grad: 0.0822 (0.0842) loss: 0.8184 (0.8175) time: 0.2682 data: 0.0002 max mem: 26157 Train: [25] [4700/6250] eta: 0:07:03 lr: 0.000111 grad: 0.0828 (0.0843) loss: 0.8179 (0.8174) time: 0.2681 data: 0.0002 max mem: 26157 Train: [25] [4800/6250] eta: 0:06:35 lr: 0.000111 grad: 0.0813 (0.0843) loss: 0.8195 (0.8174) time: 0.2768 data: 0.0002 max mem: 26157 Train: [25] [4900/6250] eta: 0:06:08 lr: 0.000111 grad: 0.0877 (0.0843) loss: 0.8191 (0.8174) time: 0.2695 data: 0.0002 max mem: 26157 Train: [25] [5000/6250] eta: 0:05:40 lr: 0.000111 grad: 0.0809 (0.0843) loss: 0.8220 (0.8174) time: 0.2701 data: 0.0001 max mem: 26157 Train: [25] [5100/6250] eta: 0:05:13 lr: 0.000111 grad: 0.0786 (0.0843) loss: 0.8115 (0.8174) time: 0.2714 data: 0.0002 max mem: 26157 Train: [25] [5200/6250] eta: 0:04:46 lr: 0.000111 grad: 0.0787 (0.0843) loss: 0.8270 (0.8175) time: 0.2710 data: 0.0002 max mem: 26157 Train: [25] [5300/6250] eta: 0:04:18 lr: 0.000111 grad: 0.0795 (0.0843) loss: 0.8199 (0.8175) time: 0.2701 data: 0.0002 max mem: 26157 Train: [25] [5400/6250] eta: 0:03:51 lr: 0.000111 grad: 0.0845 (0.0843) loss: 0.8174 (0.8176) time: 0.2716 data: 0.0002 max mem: 26157 Train: [25] [5500/6250] eta: 0:03:24 lr: 0.000111 grad: 0.0876 (0.0844) loss: 0.8195 (0.8176) time: 0.2708 data: 0.0002 max mem: 26157 Train: [25] [5600/6250] eta: 0:02:57 lr: 0.000111 grad: 0.0818 (0.0844) loss: 0.8193 (0.8176) time: 0.2695 data: 0.0003 max mem: 26157 Train: [25] [5700/6250] eta: 0:02:29 lr: 0.000111 grad: 0.0803 (0.0844) loss: 0.8206 (0.8177) time: 0.2699 data: 0.0002 max mem: 26157 Train: [25] [5800/6250] eta: 0:02:02 lr: 0.000111 grad: 0.0807 (0.0844) loss: 0.8192 (0.8177) time: 0.2694 data: 0.0002 max mem: 26157 Train: [25] [5900/6250] eta: 0:01:35 lr: 0.000111 grad: 0.0805 (0.0844) loss: 0.8207 (0.8177) time: 0.2692 data: 0.0002 max mem: 26157 Train: [25] [6000/6250] eta: 0:01:08 lr: 0.000111 grad: 0.0836 (0.0843) loss: 0.8220 (0.8178) time: 0.2712 data: 0.0002 max mem: 26157 Train: [25] [6100/6250] eta: 0:00:40 lr: 0.000111 grad: 0.0874 (0.0844) loss: 0.8196 (0.8178) time: 0.4334 data: 0.1577 max mem: 26157 Train: [25] [6200/6250] eta: 0:00:13 lr: 0.000111 grad: 0.0859 (0.0844) loss: 0.8114 (0.8178) time: 0.2701 data: 0.0002 max mem: 26157 Train: [25] [6249/6250] eta: 0:00:00 lr: 0.000111 grad: 0.0800 (0.0844) loss: 0.8155 (0.8178) time: 0.2710 data: 0.0002 max mem: 26157 Train: [25] Total time: 0:28:31 (0.2738 s / it) Averaged stats: lr: 0.000111 grad: 0.0800 (0.0844) loss: 0.8155 (0.8178) Eval (hcp-train-subset): [25] [ 0/62] eta: 0:04:51 loss: 0.8455 (0.8455) time: 4.7002 data: 4.6158 max mem: 26157 Eval (hcp-train-subset): [25] [61/62] eta: 0:00:00 loss: 0.8333 (0.8339) time: 0.1266 data: 0.0435 max mem: 26157 Eval (hcp-train-subset): [25] Total time: 0:00:12 (0.2049 s / it) Averaged stats (hcp-train-subset): loss: 0.8333 (0.8339) Making plots (hcp-train-subset): example=35 Eval (hcp-val): [25] [ 0/62] eta: 0:03:24 loss: 0.8251 (0.8251) time: 3.2909 data: 3.1760 max mem: 26157 Eval (hcp-val): [25] [61/62] eta: 0:00:00 loss: 0.8300 (0.8306) time: 0.1326 data: 0.0500 max mem: 26157 Eval (hcp-val): [25] Total time: 0:00:12 (0.2066 s / it) Averaged stats (hcp-val): loss: 0.8300 (0.8306) Making plots (hcp-val): example=26 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [26] [ 0/6250] eta: 7:04:30 lr: 0.000111 grad: 0.0570 (0.0570) loss: 0.8668 (0.8668) time: 4.0753 data: 3.7289 max mem: 26157 Train: [26] [ 100/6250] eta: 0:34:25 lr: 0.000111 grad: 0.0733 (0.0855) loss: 0.8353 (0.8347) time: 0.2711 data: 0.0002 max mem: 26157 Train: [26] [ 200/6250] eta: 0:30:31 lr: 0.000110 grad: 0.0735 (0.0875) loss: 0.8247 (0.8286) time: 0.2700 data: 0.0002 max mem: 26157 Train: [26] [ 300/6250] eta: 0:28:55 lr: 0.000110 grad: 0.0908 (0.0884) loss: 0.8255 (0.8256) time: 0.2690 data: 0.0002 max mem: 26157 Train: [26] [ 400/6250] eta: 0:27:54 lr: 0.000110 grad: 0.0742 (0.0868) loss: 0.8291 (0.8249) time: 0.2698 data: 0.0002 max mem: 26157 Train: [26] [ 500/6250] eta: 0:27:09 lr: 0.000110 grad: 0.0809 (0.0859) loss: 0.8175 (0.8242) time: 0.2734 data: 0.0002 max mem: 26157 Train: [26] [ 600/6250] eta: 0:26:28 lr: 0.000110 grad: 0.0747 (0.0847) loss: 0.8197 (0.8243) time: 0.2683 data: 0.0002 max mem: 26157 Train: [26] [ 700/6250] eta: 0:25:50 lr: 0.000110 grad: 0.0766 (0.0840) loss: 0.8178 (0.8241) time: 0.2691 data: 0.0002 max mem: 26157 Train: [26] [ 800/6250] eta: 0:25:16 lr: 0.000110 grad: 0.0750 (0.0836) loss: 0.8262 (0.8239) time: 0.2718 data: 0.0002 max mem: 26157 Train: [26] [ 900/6250] eta: 0:24:44 lr: 0.000110 grad: 0.0777 (0.0832) loss: 0.8272 (0.8237) time: 0.2726 data: 0.0002 max mem: 26157 Train: [26] [1000/6250] eta: 0:24:12 lr: 0.000110 grad: 0.0811 (0.0829) loss: 0.8210 (0.8236) time: 0.2693 data: 0.0002 max mem: 26157 Train: [26] [1100/6250] eta: 0:23:41 lr: 0.000110 grad: 0.0796 (0.0828) loss: 0.8192 (0.8234) time: 0.2684 data: 0.0002 max mem: 26157 Train: [26] [1200/6250] eta: 0:23:11 lr: 0.000110 grad: 0.0812 (0.0826) loss: 0.8136 (0.8231) time: 0.2693 data: 0.0002 max mem: 26157 Train: [26] [1300/6250] eta: 0:22:41 lr: 0.000110 grad: 0.0837 (0.0827) loss: 0.8170 (0.8228) time: 0.2700 data: 0.0002 max mem: 26157 Train: [26] [1400/6250] eta: 0:22:12 lr: 0.000110 grad: 0.0853 (0.0830) loss: 0.8166 (0.8225) time: 0.2687 data: 0.0002 max mem: 26157 Train: [26] [1500/6250] eta: 0:21:43 lr: 0.000110 grad: 0.0805 (0.0830) loss: 0.8153 (0.8221) time: 0.2693 data: 0.0002 max mem: 26157 Train: [26] [1600/6250] eta: 0:21:14 lr: 0.000110 grad: 0.0838 (0.0831) loss: 0.8175 (0.8219) time: 0.2697 data: 0.0001 max mem: 26157 Train: [26] [1700/6250] eta: 0:20:51 lr: 0.000110 grad: 0.0792 (0.0832) loss: 0.8173 (0.8216) time: 0.3850 data: 0.1146 max mem: 26157 Train: [26] [1800/6250] eta: 0:20:22 lr: 0.000110 grad: 0.0877 (0.0834) loss: 0.8173 (0.8213) time: 0.2689 data: 0.0002 max mem: 26157 Train: [26] [1900/6250] eta: 0:19:54 lr: 0.000110 grad: 0.0860 (0.0835) loss: 0.8248 (0.8212) time: 0.2700 data: 0.0002 max mem: 26157 Train: [26] [2000/6250] eta: 0:19:25 lr: 0.000110 grad: 0.0843 (0.0837) loss: 0.8201 (0.8209) time: 0.2687 data: 0.0002 max mem: 26157 Train: [26] [2100/6250] eta: 0:18:57 lr: 0.000110 grad: 0.0822 (0.0837) loss: 0.8198 (0.8208) time: 0.2687 data: 0.0001 max mem: 26157 Train: [26] [2200/6250] eta: 0:18:30 lr: 0.000110 grad: 0.0849 (0.0839) loss: 0.8180 (0.8208) time: 0.2714 data: 0.0002 max mem: 26157 Train: [26] [2300/6250] eta: 0:18:02 lr: 0.000110 grad: 0.0793 (0.0839) loss: 0.8255 (0.8208) time: 0.2692 data: 0.0002 max mem: 26157 Train: [26] [2400/6250] eta: 0:17:33 lr: 0.000110 grad: 0.0867 (0.0841) loss: 0.8189 (0.8207) time: 0.2701 data: 0.0002 max mem: 26157 Train: [26] [2500/6250] eta: 0:17:05 lr: 0.000110 grad: 0.0816 (0.0841) loss: 0.8243 (0.8207) time: 0.2688 data: 0.0002 max mem: 26157 Train: [26] [2600/6250] eta: 0:16:37 lr: 0.000110 grad: 0.0822 (0.0841) loss: 0.8167 (0.8207) time: 0.2683 data: 0.0001 max mem: 26157 Train: [26] [2700/6250] eta: 0:16:09 lr: 0.000110 grad: 0.0802 (0.0842) loss: 0.8237 (0.8207) time: 0.2679 data: 0.0001 max mem: 26157 Train: [26] [2800/6250] eta: 0:15:41 lr: 0.000110 grad: 0.0772 (0.0842) loss: 0.8218 (0.8207) time: 0.2681 data: 0.0002 max mem: 26157 Train: [26] [2900/6250] eta: 0:15:14 lr: 0.000110 grad: 0.0859 (0.0842) loss: 0.8229 (0.8207) time: 0.2693 data: 0.0002 max mem: 26157 Train: [26] [3000/6250] eta: 0:14:46 lr: 0.000110 grad: 0.0858 (0.0842) loss: 0.8214 (0.8207) time: 0.2699 data: 0.0002 max mem: 26157 Train: [26] [3100/6250] eta: 0:14:19 lr: 0.000110 grad: 0.0786 (0.0841) loss: 0.8188 (0.8208) time: 0.2712 data: 0.0002 max mem: 26157 Train: [26] [3200/6250] eta: 0:13:51 lr: 0.000110 grad: 0.0800 (0.0841) loss: 0.8147 (0.8207) time: 0.2677 data: 0.0002 max mem: 26157 Train: [26] [3300/6250] eta: 0:13:24 lr: 0.000110 grad: 0.0769 (0.0840) loss: 0.8213 (0.8208) time: 0.2713 data: 0.0002 max mem: 26157 Train: [26] [3400/6250] eta: 0:12:58 lr: 0.000110 grad: 0.0833 (0.0840) loss: 0.8188 (0.8207) time: 0.2716 data: 0.0002 max mem: 26157 Train: [26] [3500/6250] eta: 0:12:31 lr: 0.000110 grad: 0.0873 (0.0840) loss: 0.8120 (0.8206) time: 0.2694 data: 0.0002 max mem: 26157 Train: [26] [3600/6250] eta: 0:12:03 lr: 0.000110 grad: 0.0844 (0.0841) loss: 0.8213 (0.8205) time: 0.2679 data: 0.0002 max mem: 26157 Train: [26] [3700/6250] eta: 0:11:36 lr: 0.000110 grad: 0.0805 (0.0841) loss: 0.8197 (0.8205) time: 0.2706 data: 0.0002 max mem: 26157 Train: [26] [3800/6250] eta: 0:11:08 lr: 0.000110 grad: 0.0803 (0.0841) loss: 0.8149 (0.8204) time: 0.2698 data: 0.0002 max mem: 26157 Train: [26] [3900/6250] eta: 0:10:40 lr: 0.000110 grad: 0.0824 (0.0840) loss: 0.8234 (0.8203) time: 0.2690 data: 0.0002 max mem: 26157 Train: [26] [4000/6250] eta: 0:10:13 lr: 0.000110 grad: 0.0831 (0.0841) loss: 0.8193 (0.8203) time: 0.2682 data: 0.0002 max mem: 26157 Train: [26] [4100/6250] eta: 0:09:46 lr: 0.000110 grad: 0.0778 (0.0841) loss: 0.8243 (0.8204) time: 0.2687 data: 0.0002 max mem: 26157 Train: [26] [4200/6250] eta: 0:09:18 lr: 0.000110 grad: 0.0856 (0.0840) loss: 0.8155 (0.8203) time: 0.2685 data: 0.0002 max mem: 26157 Train: [26] [4300/6250] eta: 0:08:51 lr: 0.000110 grad: 0.0832 (0.0841) loss: 0.8213 (0.8203) time: 0.2700 data: 0.0002 max mem: 26157 Train: [26] [4400/6250] eta: 0:08:23 lr: 0.000110 grad: 0.0792 (0.0840) loss: 0.8224 (0.8203) time: 0.2696 data: 0.0003 max mem: 26157 Train: [26] [4500/6250] eta: 0:07:56 lr: 0.000110 grad: 0.0860 (0.0841) loss: 0.8127 (0.8202) time: 0.2706 data: 0.0002 max mem: 26157 Train: [26] [4600/6250] eta: 0:07:29 lr: 0.000110 grad: 0.0835 (0.0841) loss: 0.8185 (0.8202) time: 0.2695 data: 0.0003 max mem: 26157 Train: [26] [4700/6250] eta: 0:07:01 lr: 0.000110 grad: 0.0799 (0.0841) loss: 0.8155 (0.8201) time: 0.2678 data: 0.0002 max mem: 26157 Train: [26] [4800/6250] eta: 0:06:34 lr: 0.000109 grad: 0.0824 (0.0841) loss: 0.8190 (0.8201) time: 0.2698 data: 0.0002 max mem: 26157 Train: [26] [4900/6250] eta: 0:06:07 lr: 0.000109 grad: 0.0777 (0.0840) loss: 0.8185 (0.8201) time: 0.2692 data: 0.0002 max mem: 26157 Train: [26] [5000/6250] eta: 0:05:40 lr: 0.000109 grad: 0.0812 (0.0840) loss: 0.8217 (0.8200) time: 0.2697 data: 0.0002 max mem: 26157 Train: [26] [5100/6250] eta: 0:05:12 lr: 0.000109 grad: 0.0802 (0.0840) loss: 0.8145 (0.8200) time: 0.2701 data: 0.0002 max mem: 26157 Train: [26] [5200/6250] eta: 0:04:45 lr: 0.000109 grad: 0.0774 (0.0840) loss: 0.8232 (0.8200) time: 0.2706 data: 0.0002 max mem: 26157 Train: [26] [5300/6250] eta: 0:04:18 lr: 0.000109 grad: 0.0847 (0.0840) loss: 0.8187 (0.8199) time: 0.2703 data: 0.0002 max mem: 26157 Train: [26] [5400/6250] eta: 0:03:52 lr: 0.000109 grad: 0.0763 (0.0840) loss: 0.8178 (0.8199) time: 0.2723 data: 0.0002 max mem: 26157 Train: [26] [5500/6250] eta: 0:03:24 lr: 0.000109 grad: 0.0826 (0.0840) loss: 0.8118 (0.8199) time: 0.2692 data: 0.0002 max mem: 26157 Train: [26] [5600/6250] eta: 0:02:57 lr: 0.000109 grad: 0.0817 (0.0840) loss: 0.8167 (0.8199) time: 0.2964 data: 0.0261 max mem: 26157 Train: [26] [5700/6250] eta: 0:02:30 lr: 0.000109 grad: 0.0789 (0.0840) loss: 0.8207 (0.8198) time: 0.2688 data: 0.0002 max mem: 26157 Train: [26] [5800/6250] eta: 0:02:02 lr: 0.000109 grad: 0.0800 (0.0839) loss: 0.8178 (0.8198) time: 0.2684 data: 0.0002 max mem: 26157 Train: [26] [5900/6250] eta: 0:01:35 lr: 0.000109 grad: 0.0791 (0.0840) loss: 0.8163 (0.8198) time: 0.2739 data: 0.0002 max mem: 26157 Train: [26] [6000/6250] eta: 0:01:08 lr: 0.000109 grad: 0.0847 (0.0841) loss: 0.8193 (0.8198) time: 0.2683 data: 0.0002 max mem: 26157 Train: [26] [6100/6250] eta: 0:00:40 lr: 0.000109 grad: 0.0826 (0.0842) loss: 0.8165 (0.8197) time: 0.2684 data: 0.0004 max mem: 26157 Train: [26] [6200/6250] eta: 0:00:13 lr: 0.000109 grad: 0.0862 (0.0843) loss: 0.8149 (0.8197) time: 0.2704 data: 0.0002 max mem: 26157 Train: [26] [6249/6250] eta: 0:00:00 lr: 0.000109 grad: 0.0800 (0.0843) loss: 0.8162 (0.8196) time: 0.2718 data: 0.0002 max mem: 26157 Train: [26] Total time: 0:28:32 (0.2740 s / it) Averaged stats: lr: 0.000109 grad: 0.0800 (0.0843) loss: 0.8162 (0.8196) Eval (hcp-train-subset): [26] [ 0/62] eta: 0:03:08 loss: 0.8478 (0.8478) time: 3.0373 data: 2.9162 max mem: 26157 Eval (hcp-train-subset): [26] [61/62] eta: 0:00:00 loss: 0.8337 (0.8338) time: 0.1320 data: 0.0484 max mem: 26157 Eval (hcp-train-subset): [26] Total time: 0:00:12 (0.2011 s / it) Averaged stats (hcp-train-subset): loss: 0.8337 (0.8338) Making plots (hcp-train-subset): example=11 Eval (hcp-val): [26] [ 0/62] eta: 0:03:25 loss: 0.8255 (0.8255) time: 3.3118 data: 3.2165 max mem: 26157 Eval (hcp-val): [26] [61/62] eta: 0:00:00 loss: 0.8299 (0.8307) time: 0.1092 data: 0.0261 max mem: 26157 Eval (hcp-val): [26] Total time: 0:00:12 (0.1999 s / it) Averaged stats (hcp-val): loss: 0.8299 (0.8307) Making plots (hcp-val): example=7 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [27] [ 0/6250] eta: 8:34:57 lr: 0.000109 grad: 0.2265 (0.2265) loss: 0.8354 (0.8354) time: 4.9436 data: 4.5906 max mem: 26157 Train: [27] [ 100/6250] eta: 0:33:21 lr: 0.000109 grad: 0.0912 (0.1068) loss: 0.8168 (0.8177) time: 0.2690 data: 0.0003 max mem: 26157 Train: [27] [ 200/6250] eta: 0:29:58 lr: 0.000109 grad: 0.0820 (0.0993) loss: 0.8213 (0.8150) time: 0.2698 data: 0.0002 max mem: 26157 Train: [27] [ 300/6250] eta: 0:28:34 lr: 0.000109 grad: 0.0822 (0.0946) loss: 0.8201 (0.8151) time: 0.2721 data: 0.0002 max mem: 26157 Train: [27] [ 400/6250] eta: 0:27:40 lr: 0.000109 grad: 0.0934 (0.0928) loss: 0.8114 (0.8153) time: 0.2702 data: 0.0002 max mem: 26157 Train: [27] [ 500/6250] eta: 0:26:58 lr: 0.000109 grad: 0.0821 (0.0915) loss: 0.8076 (0.8154) time: 0.2700 data: 0.0002 max mem: 26157 Train: [27] [ 600/6250] eta: 0:26:19 lr: 0.000109 grad: 0.0796 (0.0903) loss: 0.8239 (0.8156) time: 0.2700 data: 0.0002 max mem: 26157 Train: [27] [ 700/6250] eta: 0:25:44 lr: 0.000109 grad: 0.0805 (0.0892) loss: 0.8216 (0.8159) time: 0.2693 data: 0.0002 max mem: 26157 Train: [27] [ 800/6250] eta: 0:25:12 lr: 0.000109 grad: 0.0764 (0.0882) loss: 0.8238 (0.8162) time: 0.2773 data: 0.0002 max mem: 26157 Train: [27] [ 900/6250] eta: 0:24:52 lr: 0.000109 grad: 0.0813 (0.0874) loss: 0.8175 (0.8166) time: 0.2703 data: 0.0002 max mem: 26157 Train: [27] [1000/6250] eta: 0:24:33 lr: 0.000109 grad: 0.0824 (0.0871) loss: 0.8213 (0.8169) time: 0.2693 data: 0.0002 max mem: 26157 Train: [27] [1100/6250] eta: 0:24:00 lr: 0.000109 grad: 0.0765 (0.0865) loss: 0.8196 (0.8172) time: 0.2682 data: 0.0002 max mem: 26157 Train: [27] [1200/6250] eta: 0:23:27 lr: 0.000109 grad: 0.0770 (0.0860) loss: 0.8176 (0.8174) time: 0.2689 data: 0.0003 max mem: 26157 Train: [27] [1300/6250] eta: 0:22:55 lr: 0.000109 grad: 0.0856 (0.0858) loss: 0.8137 (0.8173) time: 0.2682 data: 0.0002 max mem: 26157 Train: [27] [1400/6250] eta: 0:22:25 lr: 0.000109 grad: 0.0849 (0.0858) loss: 0.8122 (0.8172) time: 0.2693 data: 0.0002 max mem: 26157 Train: [27] [1500/6250] eta: 0:21:54 lr: 0.000109 grad: 0.0815 (0.0858) loss: 0.8137 (0.8170) time: 0.2688 data: 0.0002 max mem: 26157 Train: [27] [1600/6250] eta: 0:21:24 lr: 0.000109 grad: 0.0822 (0.0859) loss: 0.8136 (0.8167) time: 0.2686 data: 0.0002 max mem: 26157 Train: [27] [1700/6250] eta: 0:20:55 lr: 0.000109 grad: 0.0849 (0.0860) loss: 0.8114 (0.8165) time: 0.2696 data: 0.0002 max mem: 26157 Train: [27] [1800/6250] eta: 0:20:26 lr: 0.000109 grad: 0.0863 (0.0862) loss: 0.8121 (0.8162) time: 0.2694 data: 0.0002 max mem: 26157 Train: [27] [1900/6250] eta: 0:19:57 lr: 0.000109 grad: 0.0966 (0.0866) loss: 0.8101 (0.8159) time: 0.2697 data: 0.0002 max mem: 26157 Train: [27] [2000/6250] eta: 0:19:28 lr: 0.000109 grad: 0.0871 (0.0866) loss: 0.8145 (0.8156) time: 0.2690 data: 0.0002 max mem: 26157 Train: [27] [2100/6250] eta: 0:19:00 lr: 0.000109 grad: 0.0886 (0.0867) loss: 0.8095 (0.8154) time: 0.2713 data: 0.0002 max mem: 26157 Train: [27] [2200/6250] eta: 0:18:32 lr: 0.000109 grad: 0.0852 (0.0869) loss: 0.8099 (0.8151) time: 0.2707 data: 0.0002 max mem: 26157 Train: [27] [2300/6250] eta: 0:18:03 lr: 0.000109 grad: 0.0812 (0.0869) loss: 0.8145 (0.8149) time: 0.2699 data: 0.0002 max mem: 26157 Train: [27] [2400/6250] eta: 0:17:35 lr: 0.000109 grad: 0.0883 (0.0871) loss: 0.8067 (0.8148) time: 0.2691 data: 0.0002 max mem: 26157 Train: [27] [2500/6250] eta: 0:17:07 lr: 0.000109 grad: 0.0823 (0.0871) loss: 0.8137 (0.8148) time: 0.2696 data: 0.0002 max mem: 26157 Train: [27] [2600/6250] eta: 0:16:39 lr: 0.000109 grad: 0.0871 (0.0872) loss: 0.8137 (0.8146) time: 0.2699 data: 0.0002 max mem: 26157 Train: [27] [2700/6250] eta: 0:16:11 lr: 0.000109 grad: 0.0856 (0.0872) loss: 0.8098 (0.8145) time: 0.2697 data: 0.0002 max mem: 26157 Train: [27] [2800/6250] eta: 0:15:44 lr: 0.000109 grad: 0.0864 (0.0873) loss: 0.8113 (0.8144) time: 0.2689 data: 0.0002 max mem: 26157 Train: [27] [2900/6250] eta: 0:15:16 lr: 0.000109 grad: 0.0823 (0.0874) loss: 0.8136 (0.8143) time: 0.2727 data: 0.0002 max mem: 26157 Train: [27] [3000/6250] eta: 0:14:48 lr: 0.000109 grad: 0.0845 (0.0873) loss: 0.8132 (0.8143) time: 0.2703 data: 0.0002 max mem: 26157 Train: [27] [3100/6250] eta: 0:14:20 lr: 0.000108 grad: 0.0845 (0.0874) loss: 0.8133 (0.8143) time: 0.2695 data: 0.0002 max mem: 26157 Train: [27] [3200/6250] eta: 0:13:53 lr: 0.000108 grad: 0.0869 (0.0874) loss: 0.8150 (0.8143) time: 0.2678 data: 0.0001 max mem: 26157 Train: [27] [3300/6250] eta: 0:13:25 lr: 0.000108 grad: 0.0853 (0.0874) loss: 0.8080 (0.8142) time: 0.2704 data: 0.0002 max mem: 26157 Train: [27] [3400/6250] eta: 0:12:57 lr: 0.000108 grad: 0.0879 (0.0874) loss: 0.8144 (0.8142) time: 0.2685 data: 0.0002 max mem: 26157 Train: [27] [3500/6250] eta: 0:12:30 lr: 0.000108 grad: 0.0869 (0.0874) loss: 0.8130 (0.8141) time: 0.2685 data: 0.0002 max mem: 26157 Train: [27] [3600/6250] eta: 0:12:02 lr: 0.000108 grad: 0.0888 (0.0874) loss: 0.8127 (0.8141) time: 0.2700 data: 0.0002 max mem: 26157 Train: [27] [3700/6250] eta: 0:11:35 lr: 0.000108 grad: 0.0832 (0.0874) loss: 0.8174 (0.8142) time: 0.2689 data: 0.0002 max mem: 26157 Train: [27] [3800/6250] eta: 0:11:10 lr: 0.000108 grad: 0.0855 (0.0874) loss: 0.8166 (0.8142) time: 0.3347 data: 0.0002 max mem: 26157 Train: [27] [3900/6250] eta: 0:10:43 lr: 0.000108 grad: 0.0867 (0.0873) loss: 0.8136 (0.8143) time: 0.2721 data: 0.0003 max mem: 26157 Train: [27] [4000/6250] eta: 0:10:15 lr: 0.000108 grad: 0.0845 (0.0874) loss: 0.8195 (0.8144) time: 0.2694 data: 0.0002 max mem: 26157 Train: [27] [4100/6250] eta: 0:09:48 lr: 0.000108 grad: 0.0757 (0.0873) loss: 0.8147 (0.8145) time: 0.2690 data: 0.0002 max mem: 26157 Train: [27] [4200/6250] eta: 0:09:20 lr: 0.000108 grad: 0.0852 (0.0872) loss: 0.8155 (0.8145) time: 0.2688 data: 0.0002 max mem: 26157 Train: [27] [4300/6250] eta: 0:08:53 lr: 0.000108 grad: 0.0866 (0.0872) loss: 0.8161 (0.8146) time: 0.2719 data: 0.0002 max mem: 26157 Train: [27] [4400/6250] eta: 0:08:25 lr: 0.000108 grad: 0.0916 (0.0872) loss: 0.8102 (0.8146) time: 0.2702 data: 0.0002 max mem: 26157 Train: [27] [4500/6250] eta: 0:07:58 lr: 0.000108 grad: 0.0845 (0.0872) loss: 0.8155 (0.8147) time: 0.2688 data: 0.0002 max mem: 26157 Train: [27] [4600/6250] eta: 0:07:30 lr: 0.000108 grad: 0.0933 (0.0872) loss: 0.8151 (0.8147) time: 0.2703 data: 0.0002 max mem: 26157 Train: [27] [4700/6250] eta: 0:07:03 lr: 0.000108 grad: 0.0886 (0.0873) loss: 0.8178 (0.8147) time: 0.2689 data: 0.0002 max mem: 26157 Train: [27] [4800/6250] eta: 0:06:35 lr: 0.000108 grad: 0.0858 (0.0874) loss: 0.8168 (0.8148) time: 0.2696 data: 0.0002 max mem: 26157 Train: [27] [4900/6250] eta: 0:06:08 lr: 0.000108 grad: 0.0826 (0.0874) loss: 0.8199 (0.8148) time: 0.2703 data: 0.0002 max mem: 26157 Train: [27] [5000/6250] eta: 0:05:41 lr: 0.000108 grad: 0.0929 (0.0874) loss: 0.8084 (0.8148) time: 0.2692 data: 0.0002 max mem: 26157 Train: [27] [5100/6250] eta: 0:05:13 lr: 0.000108 grad: 0.0866 (0.0875) loss: 0.8100 (0.8148) time: 0.2685 data: 0.0002 max mem: 26157 Train: [27] [5200/6250] eta: 0:04:46 lr: 0.000108 grad: 0.0837 (0.0875) loss: 0.8158 (0.8147) time: 0.2693 data: 0.0002 max mem: 26157 Train: [27] [5300/6250] eta: 0:04:19 lr: 0.000108 grad: 0.0844 (0.0875) loss: 0.8115 (0.8147) time: 0.2699 data: 0.0002 max mem: 26157 Train: [27] [5400/6250] eta: 0:03:51 lr: 0.000108 grad: 0.0864 (0.0875) loss: 0.8129 (0.8147) time: 0.2690 data: 0.0002 max mem: 26157 Train: [27] [5500/6250] eta: 0:03:24 lr: 0.000108 grad: 0.0900 (0.0875) loss: 0.8143 (0.8147) time: 0.2684 data: 0.0002 max mem: 26157 Train: [27] [5600/6250] eta: 0:02:57 lr: 0.000108 grad: 0.0829 (0.0876) loss: 0.8144 (0.8147) time: 0.2686 data: 0.0002 max mem: 26157 Train: [27] [5700/6250] eta: 0:02:29 lr: 0.000108 grad: 0.0851 (0.0876) loss: 0.8153 (0.8147) time: 0.2684 data: 0.0002 max mem: 26157 Train: [27] [5800/6250] eta: 0:02:02 lr: 0.000108 grad: 0.0909 (0.0876) loss: 0.8111 (0.8147) time: 0.2698 data: 0.0002 max mem: 26157 Train: [27] [5900/6250] eta: 0:01:35 lr: 0.000108 grad: 0.0821 (0.0876) loss: 0.8126 (0.8147) time: 0.2678 data: 0.0002 max mem: 26157 Train: [27] [6000/6250] eta: 0:01:08 lr: 0.000108 grad: 0.0802 (0.0876) loss: 0.8122 (0.8147) time: 0.2690 data: 0.0002 max mem: 26157 Train: [27] [6100/6250] eta: 0:00:40 lr: 0.000108 grad: 0.0818 (0.0875) loss: 0.8185 (0.8148) time: 0.2718 data: 0.0002 max mem: 26157 Train: [27] [6200/6250] eta: 0:00:13 lr: 0.000108 grad: 0.0876 (0.0875) loss: 0.8223 (0.8148) time: 0.2685 data: 0.0002 max mem: 26157 Train: [27] [6249/6250] eta: 0:00:00 lr: 0.000108 grad: 0.0865 (0.0875) loss: 0.8258 (0.8148) time: 0.2696 data: 0.0002 max mem: 26157 Train: [27] Total time: 0:28:27 (0.2732 s / it) Averaged stats: lr: 0.000108 grad: 0.0865 (0.0875) loss: 0.8258 (0.8148) Eval (hcp-train-subset): [27] [ 0/62] eta: 0:03:55 loss: 0.8464 (0.8464) time: 3.7922 data: 3.6473 max mem: 26157 Eval (hcp-train-subset): [27] [61/62] eta: 0:00:00 loss: 0.8320 (0.8338) time: 0.1171 data: 0.0340 max mem: 26157 Eval (hcp-train-subset): [27] Total time: 0:00:12 (0.2026 s / it) Averaged stats (hcp-train-subset): loss: 0.8320 (0.8338) Making plots (hcp-train-subset): example=3 Eval (hcp-val): [27] [ 0/62] eta: 0:05:43 loss: 0.8243 (0.8243) time: 5.5353 data: 5.4511 max mem: 26157 Eval (hcp-val): [27] [61/62] eta: 0:00:00 loss: 0.8297 (0.8300) time: 0.1071 data: 0.0224 max mem: 26157 Eval (hcp-val): [27] Total time: 0:00:12 (0.2018 s / it) Averaged stats (hcp-val): loss: 0.8297 (0.8300) Making plots (hcp-val): example=51 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [28] [ 0/6250] eta: 10:18:35 lr: 0.000108 grad: 0.0687 (0.0687) loss: 0.8449 (0.8449) time: 5.9385 data: 5.6620 max mem: 26157 Train: [28] [ 100/6250] eta: 0:33:28 lr: 0.000108 grad: 0.0785 (0.0932) loss: 0.8257 (0.8283) time: 0.2691 data: 0.0002 max mem: 26157 Train: [28] [ 200/6250] eta: 0:30:12 lr: 0.000108 grad: 0.0873 (0.0910) loss: 0.8199 (0.8248) time: 0.2698 data: 0.0002 max mem: 26157 Train: [28] [ 300/6250] eta: 0:28:47 lr: 0.000108 grad: 0.0856 (0.0907) loss: 0.8191 (0.8225) time: 0.2701 data: 0.0003 max mem: 26157 Train: [28] [ 400/6250] eta: 0:28:43 lr: 0.000108 grad: 0.0842 (0.0901) loss: 0.8145 (0.8206) time: 0.3404 data: 0.0625 max mem: 26157 Train: [28] [ 500/6250] eta: 0:27:49 lr: 0.000108 grad: 0.0779 (0.0884) loss: 0.8292 (0.8207) time: 0.2702 data: 0.0003 max mem: 26157 Train: [28] [ 600/6250] eta: 0:27:02 lr: 0.000108 grad: 0.0826 (0.0875) loss: 0.8180 (0.8207) time: 0.2703 data: 0.0002 max mem: 26157 Train: [28] [ 700/6250] eta: 0:26:22 lr: 0.000108 grad: 0.0813 (0.0868) loss: 0.8187 (0.8205) time: 0.2743 data: 0.0002 max mem: 26157 Train: [28] [ 800/6250] eta: 0:26:05 lr: 0.000108 grad: 0.0773 (0.0863) loss: 0.8215 (0.8202) time: 0.3269 data: 0.0528 max mem: 26157 Train: [28] [ 900/6250] eta: 0:25:26 lr: 0.000108 grad: 0.0842 (0.0857) loss: 0.8197 (0.8201) time: 0.2710 data: 0.0002 max mem: 26157 Train: [28] [1000/6250] eta: 0:24:51 lr: 0.000108 grad: 0.0835 (0.0857) loss: 0.8121 (0.8199) time: 0.2701 data: 0.0003 max mem: 26157 Train: [28] [1100/6250] eta: 0:24:16 lr: 0.000108 grad: 0.0830 (0.0853) loss: 0.8143 (0.8196) time: 0.2699 data: 0.0002 max mem: 26157 Train: [28] [1200/6250] eta: 0:23:43 lr: 0.000108 grad: 0.0793 (0.0854) loss: 0.8123 (0.8192) time: 0.2716 data: 0.0002 max mem: 26157 Train: [28] [1300/6250] eta: 0:23:10 lr: 0.000107 grad: 0.0846 (0.0854) loss: 0.8112 (0.8189) time: 0.2708 data: 0.0002 max mem: 26157 Train: [28] [1400/6250] eta: 0:22:40 lr: 0.000107 grad: 0.0799 (0.0852) loss: 0.8182 (0.8187) time: 0.2779 data: 0.0002 max mem: 26157 Train: [28] [1500/6250] eta: 0:22:08 lr: 0.000107 grad: 0.0843 (0.0853) loss: 0.8203 (0.8185) time: 0.2684 data: 0.0002 max mem: 26157 Train: [28] [1600/6250] eta: 0:21:37 lr: 0.000107 grad: 0.0834 (0.0853) loss: 0.8161 (0.8184) time: 0.2705 data: 0.0002 max mem: 26157 Train: [28] [1700/6250] eta: 0:21:07 lr: 0.000107 grad: 0.0877 (0.0855) loss: 0.8158 (0.8181) time: 0.2700 data: 0.0002 max mem: 26157 Train: [28] [1800/6250] eta: 0:20:37 lr: 0.000107 grad: 0.0855 (0.0859) loss: 0.8202 (0.8179) time: 0.2713 data: 0.0002 max mem: 26157 Train: [28] [1900/6250] eta: 0:20:19 lr: 0.000107 grad: 0.0810 (0.0858) loss: 0.8215 (0.8178) time: 0.5068 data: 0.2323 max mem: 26157 Train: [28] [2000/6250] eta: 0:19:49 lr: 0.000107 grad: 0.0857 (0.0857) loss: 0.8167 (0.8178) time: 0.2701 data: 0.0002 max mem: 26157 Train: [28] [2100/6250] eta: 0:19:25 lr: 0.000107 grad: 0.0771 (0.0856) loss: 0.8194 (0.8179) time: 0.4142 data: 0.1334 max mem: 26157 Train: [28] [2200/6250] eta: 0:18:54 lr: 0.000107 grad: 0.0812 (0.0854) loss: 0.8227 (0.8179) time: 0.2692 data: 0.0002 max mem: 26157 Train: [28] [2300/6250] eta: 0:18:31 lr: 0.000107 grad: 0.0860 (0.0854) loss: 0.8210 (0.8178) time: 0.2692 data: 0.0002 max mem: 26157 Train: [28] [2400/6250] eta: 0:18:01 lr: 0.000107 grad: 0.0830 (0.0855) loss: 0.8163 (0.8178) time: 0.2683 data: 0.0002 max mem: 26157 Train: [28] [2500/6250] eta: 0:17:31 lr: 0.000107 grad: 0.0805 (0.0855) loss: 0.8164 (0.8177) time: 0.2675 data: 0.0002 max mem: 26157 Train: [28] [2600/6250] eta: 0:17:02 lr: 0.000107 grad: 0.0853 (0.0857) loss: 0.8130 (0.8175) time: 0.2678 data: 0.0002 max mem: 26157 Train: [28] [2700/6250] eta: 0:16:32 lr: 0.000107 grad: 0.0863 (0.0858) loss: 0.8133 (0.8173) time: 0.2683 data: 0.0002 max mem: 26157 Train: [28] [2800/6250] eta: 0:16:03 lr: 0.000107 grad: 0.0900 (0.0858) loss: 0.8063 (0.8171) time: 0.2691 data: 0.0002 max mem: 26157 Train: [28] [2900/6250] eta: 0:15:34 lr: 0.000107 grad: 0.0835 (0.0859) loss: 0.8186 (0.8169) time: 0.2686 data: 0.0002 max mem: 26157 Train: [28] [3000/6250] eta: 0:15:05 lr: 0.000107 grad: 0.0843 (0.0859) loss: 0.8125 (0.8168) time: 0.2718 data: 0.0002 max mem: 26157 Train: [28] [3100/6250] eta: 0:14:36 lr: 0.000107 grad: 0.0832 (0.0861) loss: 0.8125 (0.8167) time: 0.2695 data: 0.0002 max mem: 26157 Train: [28] [3200/6250] eta: 0:14:07 lr: 0.000107 grad: 0.0865 (0.0861) loss: 0.8073 (0.8166) time: 0.2701 data: 0.0002 max mem: 26157 Train: [28] [3300/6250] eta: 0:13:39 lr: 0.000107 grad: 0.0929 (0.0862) loss: 0.8092 (0.8165) time: 0.2699 data: 0.0002 max mem: 26157 Train: [28] [3400/6250] eta: 0:13:10 lr: 0.000107 grad: 0.0821 (0.0863) loss: 0.8175 (0.8164) time: 0.2696 data: 0.0002 max mem: 26157 Train: [28] [3500/6250] eta: 0:12:42 lr: 0.000107 grad: 0.0846 (0.0863) loss: 0.8147 (0.8164) time: 0.2710 data: 0.0002 max mem: 26157 Train: [28] [3600/6250] eta: 0:12:14 lr: 0.000107 grad: 0.0810 (0.0863) loss: 0.8162 (0.8164) time: 0.2701 data: 0.0002 max mem: 26157 Train: [28] [3700/6250] eta: 0:11:46 lr: 0.000107 grad: 0.0826 (0.0863) loss: 0.8101 (0.8163) time: 0.2701 data: 0.0002 max mem: 26157 Train: [28] [3800/6250] eta: 0:11:17 lr: 0.000107 grad: 0.0865 (0.0865) loss: 0.8180 (0.8162) time: 0.2716 data: 0.0002 max mem: 26157 Train: [28] [3900/6250] eta: 0:10:49 lr: 0.000107 grad: 0.0874 (0.0865) loss: 0.8128 (0.8161) time: 0.2696 data: 0.0002 max mem: 26157 Train: [28] [4000/6250] eta: 0:10:21 lr: 0.000107 grad: 0.0868 (0.0866) loss: 0.8116 (0.8161) time: 0.2690 data: 0.0002 max mem: 26157 Train: [28] [4100/6250] eta: 0:09:53 lr: 0.000107 grad: 0.0876 (0.0867) loss: 0.8161 (0.8160) time: 0.2686 data: 0.0002 max mem: 26157 Train: [28] [4200/6250] eta: 0:09:25 lr: 0.000107 grad: 0.0807 (0.0867) loss: 0.8174 (0.8161) time: 0.2701 data: 0.0002 max mem: 26157 Train: [28] [4300/6250] eta: 0:08:59 lr: 0.000107 grad: 0.0811 (0.0867) loss: 0.8148 (0.8161) time: 0.2701 data: 0.0002 max mem: 26157 Train: [28] [4400/6250] eta: 0:08:32 lr: 0.000107 grad: 0.0848 (0.0867) loss: 0.8200 (0.8161) time: 0.3681 data: 0.0828 max mem: 26157 Train: [28] [4500/6250] eta: 0:08:04 lr: 0.000107 grad: 0.0875 (0.0867) loss: 0.8145 (0.8161) time: 0.2687 data: 0.0002 max mem: 26157 Train: [28] [4600/6250] eta: 0:07:36 lr: 0.000107 grad: 0.0863 (0.0867) loss: 0.8150 (0.8161) time: 0.2688 data: 0.0002 max mem: 26157 Train: [28] [4700/6250] eta: 0:07:08 lr: 0.000107 grad: 0.0857 (0.0867) loss: 0.8123 (0.8161) time: 0.2699 data: 0.0002 max mem: 26157 Train: [28] [4800/6250] eta: 0:06:40 lr: 0.000107 grad: 0.0816 (0.0867) loss: 0.8150 (0.8161) time: 0.2678 data: 0.0002 max mem: 26157 Train: [28] [4900/6250] eta: 0:06:13 lr: 0.000107 grad: 0.0838 (0.0867) loss: 0.8204 (0.8162) time: 0.2704 data: 0.0002 max mem: 26157 Train: [28] [5000/6250] eta: 0:05:46 lr: 0.000107 grad: 0.0830 (0.0867) loss: 0.8192 (0.8162) time: 0.2714 data: 0.0002 max mem: 26157 Train: [28] [5100/6250] eta: 0:05:18 lr: 0.000107 grad: 0.0830 (0.0867) loss: 0.8145 (0.8162) time: 0.3457 data: 0.0714 max mem: 26157 Train: [28] [5200/6250] eta: 0:04:51 lr: 0.000107 grad: 0.0851 (0.0868) loss: 0.8190 (0.8162) time: 0.2688 data: 0.0002 max mem: 26157 Train: [28] [5300/6250] eta: 0:04:23 lr: 0.000107 grad: 0.0841 (0.0868) loss: 0.8181 (0.8163) time: 0.2697 data: 0.0002 max mem: 26157 Train: [28] [5400/6250] eta: 0:03:55 lr: 0.000107 grad: 0.0841 (0.0868) loss: 0.8209 (0.8163) time: 0.2707 data: 0.0002 max mem: 26157 Train: [28] [5500/6250] eta: 0:03:27 lr: 0.000107 grad: 0.0852 (0.0869) loss: 0.8130 (0.8163) time: 0.2678 data: 0.0002 max mem: 26157 Train: [28] [5600/6250] eta: 0:02:59 lr: 0.000106 grad: 0.0845 (0.0869) loss: 0.8181 (0.8163) time: 0.2715 data: 0.0002 max mem: 26157 Train: [28] [5700/6250] eta: 0:02:32 lr: 0.000106 grad: 0.0889 (0.0870) loss: 0.8198 (0.8164) time: 0.2696 data: 0.0001 max mem: 26157 Train: [28] [5800/6250] eta: 0:02:04 lr: 0.000106 grad: 0.0819 (0.0870) loss: 0.8174 (0.8164) time: 0.2706 data: 0.0002 max mem: 26157 Train: [28] [5900/6250] eta: 0:01:36 lr: 0.000106 grad: 0.0809 (0.0870) loss: 0.8187 (0.8164) time: 0.2727 data: 0.0002 max mem: 26157 Train: [28] [6000/6250] eta: 0:01:09 lr: 0.000106 grad: 0.0897 (0.0870) loss: 0.8155 (0.8164) time: 0.2701 data: 0.0002 max mem: 26157 Train: [28] [6100/6250] eta: 0:00:41 lr: 0.000106 grad: 0.0851 (0.0870) loss: 0.8202 (0.8164) time: 0.2688 data: 0.0002 max mem: 26157 Train: [28] [6200/6250] eta: 0:00:13 lr: 0.000106 grad: 0.0789 (0.0869) loss: 0.8155 (0.8164) time: 0.2706 data: 0.0002 max mem: 26157 Train: [28] [6249/6250] eta: 0:00:00 lr: 0.000106 grad: 0.0803 (0.0869) loss: 0.8216 (0.8165) time: 0.2679 data: 0.0002 max mem: 26157 Train: [28] Total time: 0:28:58 (0.2782 s / it) Averaged stats: lr: 0.000106 grad: 0.0803 (0.0869) loss: 0.8216 (0.8165) Eval (hcp-train-subset): [28] [ 0/62] eta: 0:03:35 loss: 0.8439 (0.8439) time: 3.4767 data: 3.3645 max mem: 26157 Eval (hcp-train-subset): [28] [61/62] eta: 0:00:00 loss: 0.8319 (0.8351) time: 0.1230 data: 0.0384 max mem: 26157 Eval (hcp-train-subset): [28] Total time: 0:00:12 (0.2092 s / it) Averaged stats (hcp-train-subset): loss: 0.8319 (0.8351) Making plots (hcp-train-subset): example=53 Eval (hcp-val): [28] [ 0/62] eta: 0:04:46 loss: 0.8274 (0.8274) time: 4.6218 data: 4.5377 max mem: 26157 Eval (hcp-val): [28] [61/62] eta: 0:00:00 loss: 0.8294 (0.8299) time: 0.1306 data: 0.0456 max mem: 26157 Eval (hcp-val): [28] Total time: 0:00:12 (0.2068 s / it) Averaged stats (hcp-val): loss: 0.8294 (0.8299) Making plots (hcp-val): example=47 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [29] [ 0/6250] eta: 11:18:07 lr: 0.000106 grad: 0.0932 (0.0932) loss: 0.8476 (0.8476) time: 6.5099 data: 6.2315 max mem: 26157 Train: [29] [ 100/6250] eta: 0:33:59 lr: 0.000106 grad: 0.0827 (0.0997) loss: 0.8244 (0.8290) time: 0.2690 data: 0.0002 max mem: 26157 Train: [29] [ 200/6250] eta: 0:30:18 lr: 0.000106 grad: 0.0831 (0.0935) loss: 0.8089 (0.8237) time: 0.2709 data: 0.0002 max mem: 26157 Train: [29] [ 300/6250] eta: 0:28:46 lr: 0.000106 grad: 0.0846 (0.0916) loss: 0.8154 (0.8210) time: 0.2690 data: 0.0002 max mem: 26157 Train: [29] [ 400/6250] eta: 0:27:51 lr: 0.000106 grad: 0.0831 (0.0912) loss: 0.8164 (0.8202) time: 0.2747 data: 0.0002 max mem: 26157 Train: [29] [ 500/6250] eta: 0:27:06 lr: 0.000106 grad: 0.0806 (0.0900) loss: 0.8145 (0.8202) time: 0.2688 data: 0.0002 max mem: 26157 Train: [29] [ 600/6250] eta: 0:26:26 lr: 0.000106 grad: 0.0786 (0.0890) loss: 0.8197 (0.8205) time: 0.2720 data: 0.0002 max mem: 26157 Train: [29] [ 700/6250] eta: 0:25:50 lr: 0.000106 grad: 0.0814 (0.0885) loss: 0.8251 (0.8210) time: 0.2723 data: 0.0002 max mem: 26157 Train: [29] [ 800/6250] eta: 0:25:16 lr: 0.000106 grad: 0.0853 (0.0882) loss: 0.8226 (0.8209) time: 0.2713 data: 0.0002 max mem: 26157 Train: [29] [ 900/6250] eta: 0:24:46 lr: 0.000106 grad: 0.0778 (0.0878) loss: 0.8265 (0.8210) time: 0.2796 data: 0.0002 max mem: 26157 Train: [29] [1000/6250] eta: 0:24:16 lr: 0.000106 grad: 0.0801 (0.0875) loss: 0.8153 (0.8206) time: 0.2696 data: 0.0002 max mem: 26157 Train: [29] [1100/6250] eta: 0:23:44 lr: 0.000106 grad: 0.0851 (0.0875) loss: 0.8097 (0.8203) time: 0.2689 data: 0.0002 max mem: 26157 Train: [29] [1200/6250] eta: 0:23:13 lr: 0.000106 grad: 0.0870 (0.0877) loss: 0.8179 (0.8198) time: 0.2692 data: 0.0002 max mem: 26157 Train: [29] [1300/6250] eta: 0:22:43 lr: 0.000106 grad: 0.0835 (0.0875) loss: 0.8149 (0.8195) time: 0.2687 data: 0.0002 max mem: 26157 Train: [29] [1400/6250] eta: 0:22:14 lr: 0.000106 grad: 0.0895 (0.0877) loss: 0.8025 (0.8189) time: 0.2687 data: 0.0002 max mem: 26157 Train: [29] [1500/6250] eta: 0:21:44 lr: 0.000106 grad: 0.0893 (0.0880) loss: 0.8173 (0.8185) time: 0.2689 data: 0.0002 max mem: 26157 Train: [29] [1600/6250] eta: 0:21:21 lr: 0.000106 grad: 0.0823 (0.0880) loss: 0.8140 (0.8182) time: 0.2732 data: 0.0002 max mem: 26157 Train: [29] [1700/6250] eta: 0:20:57 lr: 0.000106 grad: 0.0844 (0.0881) loss: 0.8171 (0.8179) time: 0.2689 data: 0.0002 max mem: 26157 Train: [29] [1800/6250] eta: 0:20:28 lr: 0.000106 grad: 0.0889 (0.0881) loss: 0.8153 (0.8176) time: 0.2717 data: 0.0002 max mem: 26157 Train: [29] [1900/6250] eta: 0:19:59 lr: 0.000106 grad: 0.0821 (0.0881) loss: 0.8079 (0.8173) time: 0.2725 data: 0.0002 max mem: 26157 Train: [29] [2000/6250] eta: 0:19:30 lr: 0.000106 grad: 0.0853 (0.0881) loss: 0.8108 (0.8171) time: 0.2709 data: 0.0002 max mem: 26157 Train: [29] [2100/6250] eta: 0:19:02 lr: 0.000106 grad: 0.0837 (0.0882) loss: 0.8114 (0.8169) time: 0.2700 data: 0.0002 max mem: 26157 Train: [29] [2200/6250] eta: 0:18:33 lr: 0.000106 grad: 0.0868 (0.0881) loss: 0.8132 (0.8167) time: 0.2704 data: 0.0002 max mem: 26157 Train: [29] [2300/6250] eta: 0:18:05 lr: 0.000106 grad: 0.0874 (0.0881) loss: 0.8157 (0.8166) time: 0.2693 data: 0.0002 max mem: 26157 Train: [29] [2400/6250] eta: 0:17:37 lr: 0.000106 grad: 0.0934 (0.0881) loss: 0.8065 (0.8165) time: 0.2714 data: 0.0002 max mem: 26157 Train: [29] [2500/6250] eta: 0:17:09 lr: 0.000106 grad: 0.0831 (0.0881) loss: 0.8167 (0.8163) time: 0.2699 data: 0.0002 max mem: 26157 Train: [29] [2600/6250] eta: 0:16:41 lr: 0.000106 grad: 0.0898 (0.0883) loss: 0.8118 (0.8160) time: 0.2685 data: 0.0002 max mem: 26157 Train: [29] [2700/6250] eta: 0:16:14 lr: 0.000106 grad: 0.0827 (0.0883) loss: 0.8107 (0.8158) time: 0.3179 data: 0.0461 max mem: 26157 Train: [29] [2800/6250] eta: 0:15:46 lr: 0.000106 grad: 0.0909 (0.0885) loss: 0.8126 (0.8156) time: 0.2676 data: 0.0002 max mem: 26157 Train: [29] [2900/6250] eta: 0:15:18 lr: 0.000106 grad: 0.0862 (0.0886) loss: 0.8098 (0.8153) time: 0.2712 data: 0.0002 max mem: 26157 Train: [29] [3000/6250] eta: 0:14:50 lr: 0.000106 grad: 0.0882 (0.0886) loss: 0.8135 (0.8151) time: 0.2691 data: 0.0002 max mem: 26157 Train: [29] [3100/6250] eta: 0:14:22 lr: 0.000106 grad: 0.0874 (0.0886) loss: 0.8134 (0.8150) time: 0.2697 data: 0.0002 max mem: 26157 Train: [29] [3200/6250] eta: 0:13:54 lr: 0.000106 grad: 0.0854 (0.0886) loss: 0.8087 (0.8149) time: 0.2698 data: 0.0002 max mem: 26157 Train: [29] [3300/6250] eta: 0:13:27 lr: 0.000106 grad: 0.0838 (0.0886) loss: 0.8140 (0.8149) time: 0.2708 data: 0.0002 max mem: 26157 Train: [29] [3400/6250] eta: 0:12:59 lr: 0.000106 grad: 0.0835 (0.0886) loss: 0.8066 (0.8148) time: 0.2691 data: 0.0002 max mem: 26157 Train: [29] [3500/6250] eta: 0:12:31 lr: 0.000105 grad: 0.0845 (0.0886) loss: 0.8134 (0.8147) time: 0.2682 data: 0.0002 max mem: 26157 Train: [29] [3600/6250] eta: 0:12:04 lr: 0.000105 grad: 0.0850 (0.0886) loss: 0.8141 (0.8147) time: 0.2683 data: 0.0001 max mem: 26157 Train: [29] [3700/6250] eta: 0:11:36 lr: 0.000105 grad: 0.0907 (0.0886) loss: 0.8151 (0.8147) time: 0.2681 data: 0.0002 max mem: 26157 Train: [29] [3800/6250] eta: 0:11:09 lr: 0.000105 grad: 0.0912 (0.0888) loss: 0.8160 (0.8147) time: 0.2693 data: 0.0002 max mem: 26157 Train: [29] [3900/6250] eta: 0:10:41 lr: 0.000105 grad: 0.0852 (0.0887) loss: 0.8137 (0.8147) time: 0.2693 data: 0.0002 max mem: 26157 Train: [29] [4000/6250] eta: 0:10:14 lr: 0.000105 grad: 0.0920 (0.0889) loss: 0.8154 (0.8147) time: 0.2698 data: 0.0002 max mem: 26157 Train: [29] [4100/6250] eta: 0:09:46 lr: 0.000105 grad: 0.0865 (0.0891) loss: 0.8131 (0.8147) time: 0.2683 data: 0.0002 max mem: 26157 Train: [29] [4200/6250] eta: 0:09:19 lr: 0.000105 grad: 0.0804 (0.0890) loss: 0.8180 (0.8147) time: 0.2692 data: 0.0002 max mem: 26157 Train: [29] [4300/6250] eta: 0:08:51 lr: 0.000105 grad: 0.0873 (0.0891) loss: 0.8080 (0.8147) time: 0.2684 data: 0.0002 max mem: 26157 Train: [29] [4400/6250] eta: 0:08:24 lr: 0.000105 grad: 0.0805 (0.0891) loss: 0.8132 (0.8146) time: 0.2696 data: 0.0002 max mem: 26157 Train: [29] [4500/6250] eta: 0:07:56 lr: 0.000105 grad: 0.0900 (0.0891) loss: 0.8001 (0.8146) time: 0.2693 data: 0.0002 max mem: 26157 Train: [29] [4600/6250] eta: 0:07:29 lr: 0.000105 grad: 0.0909 (0.0891) loss: 0.8181 (0.8145) time: 0.2701 data: 0.0002 max mem: 26157 Train: [29] [4700/6250] eta: 0:07:02 lr: 0.000105 grad: 0.0841 (0.0891) loss: 0.8142 (0.8145) time: 0.2704 data: 0.0002 max mem: 26157 Train: [29] [4800/6250] eta: 0:06:34 lr: 0.000105 grad: 0.0836 (0.0891) loss: 0.8168 (0.8145) time: 0.2704 data: 0.0002 max mem: 26157 Train: [29] [4900/6250] eta: 0:06:07 lr: 0.000105 grad: 0.0889 (0.0891) loss: 0.8135 (0.8145) time: 0.2695 data: 0.0002 max mem: 26157 Train: [29] [5000/6250] eta: 0:05:40 lr: 0.000105 grad: 0.0864 (0.0891) loss: 0.8142 (0.8145) time: 0.2683 data: 0.0002 max mem: 26157 Train: [29] [5100/6250] eta: 0:05:12 lr: 0.000105 grad: 0.0817 (0.0890) loss: 0.8141 (0.8145) time: 0.2691 data: 0.0002 max mem: 26157 Train: [29] [5200/6250] eta: 0:04:45 lr: 0.000105 grad: 0.0871 (0.0890) loss: 0.8144 (0.8145) time: 0.2679 data: 0.0002 max mem: 26157 Train: [29] [5300/6250] eta: 0:04:18 lr: 0.000105 grad: 0.0820 (0.0890) loss: 0.8124 (0.8145) time: 0.2682 data: 0.0002 max mem: 26157 Train: [29] [5400/6250] eta: 0:03:51 lr: 0.000105 grad: 0.0851 (0.0890) loss: 0.8168 (0.8146) time: 0.2684 data: 0.0002 max mem: 26157 Train: [29] [5500/6250] eta: 0:03:23 lr: 0.000105 grad: 0.0821 (0.0890) loss: 0.8194 (0.8147) time: 0.2693 data: 0.0002 max mem: 26157 Train: [29] [5600/6250] eta: 0:02:57 lr: 0.000105 grad: 0.0851 (0.0889) loss: 0.8146 (0.8147) time: 0.3857 data: 0.1082 max mem: 26157 Train: [29] [5700/6250] eta: 0:02:30 lr: 0.000105 grad: 0.0893 (0.0890) loss: 0.8150 (0.8147) time: 0.2686 data: 0.0001 max mem: 26157 Train: [29] [5800/6250] eta: 0:02:02 lr: 0.000105 grad: 0.0851 (0.0892) loss: 0.8216 (0.8147) time: 0.2697 data: 0.0002 max mem: 26157 Train: [29] [5900/6250] eta: 0:01:35 lr: 0.000105 grad: 0.0857 (0.0892) loss: 0.8142 (0.8147) time: 0.2700 data: 0.0002 max mem: 26157 Train: [29] [6000/6250] eta: 0:01:08 lr: 0.000105 grad: 0.0862 (0.0892) loss: 0.8159 (0.8147) time: 0.2720 data: 0.0001 max mem: 26157 Train: [29] [6100/6250] eta: 0:00:40 lr: 0.000105 grad: 0.0911 (0.0894) loss: 0.8159 (0.8147) time: 0.2690 data: 0.0002 max mem: 26157 Train: [29] [6200/6250] eta: 0:00:13 lr: 0.000105 grad: 0.0889 (0.0894) loss: 0.8167 (0.8147) time: 0.2706 data: 0.0002 max mem: 26157 Train: [29] [6249/6250] eta: 0:00:00 lr: 0.000105 grad: 0.0858 (0.0894) loss: 0.8129 (0.8147) time: 0.2697 data: 0.0002 max mem: 26157 Train: [29] Total time: 0:28:31 (0.2739 s / it) Averaged stats: lr: 0.000105 grad: 0.0858 (0.0894) loss: 0.8129 (0.8147) Eval (hcp-train-subset): [29] [ 0/62] eta: 0:03:48 loss: 0.8464 (0.8464) time: 3.6790 data: 3.5529 max mem: 26157 Eval (hcp-train-subset): [29] [61/62] eta: 0:00:00 loss: 0.8328 (0.8341) time: 0.1362 data: 0.0533 max mem: 26157 Eval (hcp-train-subset): [29] Total time: 0:00:13 (0.2108 s / it) Averaged stats (hcp-train-subset): loss: 0.8328 (0.8341) Making plots (hcp-train-subset): example=15 Eval (hcp-val): [29] [ 0/62] eta: 0:04:54 loss: 0.8253 (0.8253) time: 4.7499 data: 4.6662 max mem: 26157 Eval (hcp-val): [29] [61/62] eta: 0:00:00 loss: 0.8290 (0.8294) time: 0.1283 data: 0.0449 max mem: 26157 Eval (hcp-val): [29] Total time: 0:00:12 (0.2066 s / it) Averaged stats (hcp-val): loss: 0.8290 (0.8294) Making plots (hcp-val): example=8 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [30] [ 0/6250] eta: 7:26:34 lr: 0.000105 grad: 0.0570 (0.0570) loss: 0.8668 (0.8668) time: 4.2871 data: 3.9217 max mem: 26157 Train: [30] [ 100/6250] eta: 0:34:07 lr: 0.000105 grad: 0.1053 (0.1077) loss: 0.8053 (0.8261) time: 0.2710 data: 0.0002 max mem: 26157 Train: [30] [ 200/6250] eta: 0:30:23 lr: 0.000105 grad: 0.0908 (0.1014) loss: 0.8223 (0.8225) time: 0.2690 data: 0.0002 max mem: 26157 Train: [30] [ 300/6250] eta: 0:28:51 lr: 0.000105 grad: 0.0885 (0.1001) loss: 0.8122 (0.8203) time: 0.2698 data: 0.0002 max mem: 26157 Train: [30] [ 400/6250] eta: 0:27:52 lr: 0.000105 grad: 0.0822 (0.0978) loss: 0.8182 (0.8193) time: 0.2712 data: 0.0002 max mem: 26157 Train: [30] [ 500/6250] eta: 0:27:05 lr: 0.000105 grad: 0.0904 (0.0965) loss: 0.8189 (0.8182) time: 0.2694 data: 0.0002 max mem: 26157 Train: [30] [ 600/6250] eta: 0:26:25 lr: 0.000105 grad: 0.0808 (0.0956) loss: 0.8187 (0.8175) time: 0.2718 data: 0.0002 max mem: 26157 Train: [30] [ 700/6250] eta: 0:25:49 lr: 0.000105 grad: 0.0814 (0.0947) loss: 0.8206 (0.8170) time: 0.2696 data: 0.0002 max mem: 26157 Train: [30] [ 800/6250] eta: 0:25:15 lr: 0.000105 grad: 0.0808 (0.0938) loss: 0.8182 (0.8168) time: 0.2692 data: 0.0001 max mem: 26157 Train: [30] [ 900/6250] eta: 0:24:42 lr: 0.000105 grad: 0.0828 (0.0931) loss: 0.8147 (0.8166) time: 0.2689 data: 0.0002 max mem: 26157 Train: [30] [1000/6250] eta: 0:24:10 lr: 0.000105 grad: 0.0842 (0.0926) loss: 0.8216 (0.8165) time: 0.2688 data: 0.0002 max mem: 26157 Train: [30] [1100/6250] eta: 0:23:39 lr: 0.000105 grad: 0.0823 (0.0920) loss: 0.8133 (0.8163) time: 0.2684 data: 0.0002 max mem: 26157 Train: [30] [1200/6250] eta: 0:23:09 lr: 0.000105 grad: 0.0828 (0.0918) loss: 0.8133 (0.8161) time: 0.2698 data: 0.0002 max mem: 26157 Train: [30] [1300/6250] eta: 0:22:39 lr: 0.000105 grad: 0.0888 (0.0927) loss: 0.8138 (0.8158) time: 0.2697 data: 0.0001 max mem: 26157 Train: [30] [1400/6250] eta: 0:22:10 lr: 0.000104 grad: 0.0829 (0.0923) loss: 0.8119 (0.8156) time: 0.2703 data: 0.0002 max mem: 26157 Train: [30] [1500/6250] eta: 0:21:41 lr: 0.000104 grad: 0.0860 (0.0921) loss: 0.8211 (0.8157) time: 0.2689 data: 0.0004 max mem: 26157 Train: [30] [1600/6250] eta: 0:21:12 lr: 0.000104 grad: 0.0898 (0.0918) loss: 0.8135 (0.8156) time: 0.2695 data: 0.0002 max mem: 26157 Train: [30] [1700/6250] eta: 0:20:44 lr: 0.000104 grad: 0.0851 (0.0915) loss: 0.8129 (0.8157) time: 0.2764 data: 0.0002 max mem: 26157 Train: [30] [1800/6250] eta: 0:20:29 lr: 0.000104 grad: 0.0859 (0.0915) loss: 0.8161 (0.8156) time: 0.2686 data: 0.0002 max mem: 26157 Train: [30] [1900/6250] eta: 0:20:00 lr: 0.000104 grad: 0.0793 (0.0912) loss: 0.8220 (0.8156) time: 0.2694 data: 0.0002 max mem: 26157 Train: [30] [2000/6250] eta: 0:19:40 lr: 0.000104 grad: 0.0881 (0.0909) loss: 0.8114 (0.8157) time: 0.4830 data: 0.2094 max mem: 26157 Train: [30] [2100/6250] eta: 0:19:21 lr: 0.000104 grad: 0.0880 (0.0908) loss: 0.8140 (0.8156) time: 0.2690 data: 0.0002 max mem: 26157 Train: [30] [2200/6250] eta: 0:18:56 lr: 0.000104 grad: 0.0873 (0.0906) loss: 0.8139 (0.8156) time: 0.2704 data: 0.0002 max mem: 26157 Train: [30] [2300/6250] eta: 0:18:26 lr: 0.000104 grad: 0.0837 (0.0905) loss: 0.8135 (0.8156) time: 0.2734 data: 0.0002 max mem: 26157 Train: [30] [2400/6250] eta: 0:17:57 lr: 0.000104 grad: 0.0857 (0.0903) loss: 0.8159 (0.8156) time: 0.2705 data: 0.0002 max mem: 26157 Train: [30] [2500/6250] eta: 0:17:27 lr: 0.000104 grad: 0.0859 (0.0902) loss: 0.8178 (0.8155) time: 0.2690 data: 0.0002 max mem: 26157 Train: [30] [2600/6250] eta: 0:16:58 lr: 0.000104 grad: 0.0849 (0.0901) loss: 0.8165 (0.8156) time: 0.2719 data: 0.0002 max mem: 26157 Train: [30] [2700/6250] eta: 0:16:29 lr: 0.000104 grad: 0.0818 (0.0900) loss: 0.8187 (0.8157) time: 0.2693 data: 0.0002 max mem: 26157 Train: [30] [2800/6250] eta: 0:16:09 lr: 0.000104 grad: 0.0825 (0.0899) loss: 0.8272 (0.8159) time: 0.2713 data: 0.0002 max mem: 26157 Train: [30] [2900/6250] eta: 0:15:40 lr: 0.000104 grad: 0.0848 (0.0899) loss: 0.8188 (0.8159) time: 0.2699 data: 0.0002 max mem: 26157 Train: [30] [3000/6250] eta: 0:15:10 lr: 0.000104 grad: 0.0864 (0.0898) loss: 0.8211 (0.8159) time: 0.2695 data: 0.0002 max mem: 26157 Train: [30] [3100/6250] eta: 0:14:41 lr: 0.000104 grad: 0.0872 (0.0898) loss: 0.8107 (0.8159) time: 0.2709 data: 0.0002 max mem: 26157 Train: [30] [3200/6250] eta: 0:14:12 lr: 0.000104 grad: 0.0900 (0.0897) loss: 0.8161 (0.8159) time: 0.2692 data: 0.0002 max mem: 26157 Train: [30] [3300/6250] eta: 0:13:44 lr: 0.000104 grad: 0.0829 (0.0897) loss: 0.8155 (0.8159) time: 0.2695 data: 0.0002 max mem: 26157 Train: [30] [3400/6250] eta: 0:13:15 lr: 0.000104 grad: 0.0856 (0.0896) loss: 0.8152 (0.8160) time: 0.2677 data: 0.0002 max mem: 26157 Train: [30] [3500/6250] eta: 0:12:46 lr: 0.000104 grad: 0.0885 (0.0896) loss: 0.8145 (0.8160) time: 0.2679 data: 0.0002 max mem: 26157 Train: [30] [3600/6250] eta: 0:12:22 lr: 0.000104 grad: 0.0828 (0.0897) loss: 0.8196 (0.8160) time: 0.2703 data: 0.0002 max mem: 26157 Train: [30] [3700/6250] eta: 0:11:59 lr: 0.000104 grad: 0.0844 (0.0897) loss: 0.8129 (0.8160) time: 0.2706 data: 0.0002 max mem: 26157 Train: [30] [3800/6250] eta: 0:11:30 lr: 0.000104 grad: 0.0875 (0.0896) loss: 0.8138 (0.8160) time: 0.2701 data: 0.0002 max mem: 26157 Train: [30] [3900/6250] eta: 0:11:01 lr: 0.000104 grad: 0.0925 (0.0896) loss: 0.8180 (0.8160) time: 0.2693 data: 0.0002 max mem: 26157 Train: [30] [4000/6250] eta: 0:10:33 lr: 0.000104 grad: 0.0836 (0.0895) loss: 0.8186 (0.8160) time: 0.2696 data: 0.0002 max mem: 26157 Train: [30] [4100/6250] eta: 0:10:04 lr: 0.000104 grad: 0.0856 (0.0895) loss: 0.8148 (0.8160) time: 0.2679 data: 0.0002 max mem: 26157 Train: [30] [4200/6250] eta: 0:09:35 lr: 0.000104 grad: 0.0880 (0.0895) loss: 0.8215 (0.8160) time: 0.2684 data: 0.0002 max mem: 26157 Train: [30] [4300/6250] eta: 0:09:06 lr: 0.000104 grad: 0.0912 (0.0896) loss: 0.8134 (0.8160) time: 0.2690 data: 0.0002 max mem: 26157 Train: [30] [4400/6250] eta: 0:08:38 lr: 0.000104 grad: 0.0898 (0.0897) loss: 0.8173 (0.8160) time: 0.2687 data: 0.0002 max mem: 26157 Train: [30] [4500/6250] eta: 0:08:09 lr: 0.000104 grad: 0.0877 (0.0898) loss: 0.8079 (0.8160) time: 0.2693 data: 0.0002 max mem: 26157 Train: [30] [4600/6250] eta: 0:07:41 lr: 0.000104 grad: 0.0876 (0.0898) loss: 0.8118 (0.8160) time: 0.2680 data: 0.0002 max mem: 26157 Train: [30] [4700/6250] eta: 0:07:13 lr: 0.000104 grad: 0.0836 (0.0899) loss: 0.8191 (0.8160) time: 0.2704 data: 0.0002 max mem: 26157 Train: [30] [4800/6250] eta: 0:06:44 lr: 0.000104 grad: 0.0920 (0.0899) loss: 0.8113 (0.8159) time: 0.2687 data: 0.0002 max mem: 26157 Train: [30] [4900/6250] eta: 0:06:16 lr: 0.000104 grad: 0.0893 (0.0901) loss: 0.8172 (0.8158) time: 0.2692 data: 0.0002 max mem: 26157 Train: [30] [5000/6250] eta: 0:05:48 lr: 0.000104 grad: 0.0869 (0.0902) loss: 0.8186 (0.8158) time: 0.2698 data: 0.0002 max mem: 26157 Train: [30] [5100/6250] eta: 0:05:20 lr: 0.000104 grad: 0.0840 (0.0902) loss: 0.8160 (0.8157) time: 0.2681 data: 0.0002 max mem: 26157 Train: [30] [5200/6250] eta: 0:04:52 lr: 0.000104 grad: 0.0847 (0.0902) loss: 0.8152 (0.8157) time: 0.2703 data: 0.0002 max mem: 26157 Train: [30] [5300/6250] eta: 0:04:24 lr: 0.000104 grad: 0.0846 (0.0902) loss: 0.8123 (0.8157) time: 0.2693 data: 0.0002 max mem: 26157 Train: [30] [5400/6250] eta: 0:03:56 lr: 0.000103 grad: 0.0898 (0.0902) loss: 0.8168 (0.8157) time: 0.2688 data: 0.0002 max mem: 26157 Train: [30] [5500/6250] eta: 0:03:28 lr: 0.000103 grad: 0.0877 (0.0903) loss: 0.8143 (0.8156) time: 0.2692 data: 0.0002 max mem: 26157 Train: [30] [5600/6250] eta: 0:03:00 lr: 0.000103 grad: 0.0873 (0.0903) loss: 0.8179 (0.8156) time: 0.2704 data: 0.0002 max mem: 26157 Train: [30] [5700/6250] eta: 0:02:32 lr: 0.000103 grad: 0.0877 (0.0903) loss: 0.8146 (0.8156) time: 0.2687 data: 0.0002 max mem: 26157 Train: [30] [5800/6250] eta: 0:02:04 lr: 0.000103 grad: 0.0974 (0.0903) loss: 0.8092 (0.8156) time: 0.2689 data: 0.0001 max mem: 26157 Train: [30] [5900/6250] eta: 0:01:37 lr: 0.000103 grad: 0.0873 (0.0904) loss: 0.8120 (0.8155) time: 0.2776 data: 0.0003 max mem: 26157 Train: [30] [6000/6250] eta: 0:01:09 lr: 0.000103 grad: 0.0857 (0.0903) loss: 0.8115 (0.8155) time: 0.2700 data: 0.0002 max mem: 26157 Train: [30] [6100/6250] eta: 0:00:41 lr: 0.000103 grad: 0.0917 (0.0903) loss: 0.8105 (0.8155) time: 0.2721 data: 0.0002 max mem: 26157 Train: [30] [6200/6250] eta: 0:00:13 lr: 0.000103 grad: 0.0947 (0.0904) loss: 0.8121 (0.8154) time: 0.2693 data: 0.0002 max mem: 26157 Train: [30] [6249/6250] eta: 0:00:00 lr: 0.000103 grad: 0.0883 (0.0904) loss: 0.8114 (0.8153) time: 0.2693 data: 0.0002 max mem: 26157 Train: [30] Total time: 0:29:04 (0.2791 s / it) Averaged stats: lr: 0.000103 grad: 0.0883 (0.0904) loss: 0.8114 (0.8153) Eval (hcp-train-subset): [30] [ 0/62] eta: 0:04:10 loss: 0.8474 (0.8474) time: 4.0343 data: 3.9157 max mem: 26157 Eval (hcp-train-subset): [30] [61/62] eta: 0:00:00 loss: 0.8324 (0.8329) time: 0.1165 data: 0.0315 max mem: 26157 Eval (hcp-train-subset): [30] Total time: 0:00:12 (0.2017 s / it) Averaged stats (hcp-train-subset): loss: 0.8324 (0.8329) Making plots (hcp-train-subset): example=43 Eval (hcp-val): [30] [ 0/62] eta: 0:04:41 loss: 0.8258 (0.8258) time: 4.5404 data: 4.4563 max mem: 26157 Eval (hcp-val): [30] [61/62] eta: 0:00:00 loss: 0.8286 (0.8298) time: 0.1311 data: 0.0478 max mem: 26157 Eval (hcp-val): [30] Total time: 0:00:13 (0.2124 s / it) Averaged stats (hcp-val): loss: 0.8286 (0.8298) Making plots (hcp-val): example=6 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [31] [ 0/6250] eta: 10:14:42 lr: 0.000103 grad: 0.0812 (0.0812) loss: 0.8323 (0.8323) time: 5.9011 data: 5.6057 max mem: 26157 Train: [31] [ 100/6250] eta: 0:33:32 lr: 0.000103 grad: 0.0873 (0.1033) loss: 0.8179 (0.8284) time: 0.2707 data: 0.0002 max mem: 26157 Train: [31] [ 200/6250] eta: 0:30:11 lr: 0.000103 grad: 0.0862 (0.0968) loss: 0.8200 (0.8258) time: 0.2699 data: 0.0002 max mem: 26157 Train: [31] [ 300/6250] eta: 0:28:44 lr: 0.000103 grad: 0.0759 (0.0932) loss: 0.8250 (0.8239) time: 0.2694 data: 0.0002 max mem: 26157 Train: [31] [ 400/6250] eta: 0:27:45 lr: 0.000103 grad: 0.0928 (0.0930) loss: 0.8143 (0.8216) time: 0.2694 data: 0.0002 max mem: 26157 Train: [31] [ 500/6250] eta: 0:26:59 lr: 0.000103 grad: 0.0804 (0.0917) loss: 0.8156 (0.8201) time: 0.2696 data: 0.0002 max mem: 26157 Train: [31] [ 600/6250] eta: 0:26:19 lr: 0.000103 grad: 0.0887 (0.0918) loss: 0.8156 (0.8192) time: 0.2684 data: 0.0002 max mem: 26157 Train: [31] [ 700/6250] eta: 0:25:43 lr: 0.000103 grad: 0.0834 (0.0913) loss: 0.8140 (0.8185) time: 0.2687 data: 0.0002 max mem: 26157 Train: [31] [ 800/6250] eta: 0:25:09 lr: 0.000103 grad: 0.0809 (0.0906) loss: 0.8182 (0.8182) time: 0.2712 data: 0.0002 max mem: 26157 Train: [31] [ 900/6250] eta: 0:24:38 lr: 0.000103 grad: 0.0789 (0.0900) loss: 0.8218 (0.8182) time: 0.2753 data: 0.0002 max mem: 26157 Train: [31] [1000/6250] eta: 0:24:12 lr: 0.000103 grad: 0.0835 (0.0896) loss: 0.8162 (0.8181) time: 0.2686 data: 0.0002 max mem: 26157 Train: [31] [1100/6250] eta: 0:23:41 lr: 0.000103 grad: 0.0828 (0.0891) loss: 0.8180 (0.8181) time: 0.2687 data: 0.0002 max mem: 26157 Train: [31] [1200/6250] eta: 0:23:11 lr: 0.000103 grad: 0.0877 (0.0889) loss: 0.8181 (0.8180) time: 0.2751 data: 0.0002 max mem: 26157 Train: [31] [1300/6250] eta: 0:22:41 lr: 0.000103 grad: 0.0805 (0.0887) loss: 0.8144 (0.8177) time: 0.2695 data: 0.0003 max mem: 26157 Train: [31] [1400/6250] eta: 0:22:12 lr: 0.000103 grad: 0.0856 (0.0888) loss: 0.8104 (0.8174) time: 0.2676 data: 0.0001 max mem: 26157 Train: [31] [1500/6250] eta: 0:21:42 lr: 0.000103 grad: 0.0887 (0.0887) loss: 0.8118 (0.8170) time: 0.2684 data: 0.0002 max mem: 26157 Train: [31] [1600/6250] eta: 0:21:13 lr: 0.000103 grad: 0.0917 (0.0886) loss: 0.8106 (0.8167) time: 0.2688 data: 0.0002 max mem: 26157 Train: [31] [1700/6250] eta: 0:20:45 lr: 0.000103 grad: 0.0846 (0.0888) loss: 0.8134 (0.8165) time: 0.2720 data: 0.0002 max mem: 26157 Train: [31] [1800/6250] eta: 0:20:16 lr: 0.000103 grad: 0.0812 (0.0887) loss: 0.8133 (0.8165) time: 0.2699 data: 0.0002 max mem: 26157 Train: [31] [1900/6250] eta: 0:19:48 lr: 0.000103 grad: 0.0889 (0.0886) loss: 0.8144 (0.8164) time: 0.2692 data: 0.0002 max mem: 26157 Train: [31] [2000/6250] eta: 0:19:20 lr: 0.000103 grad: 0.0839 (0.0885) loss: 0.8207 (0.8163) time: 0.2693 data: 0.0001 max mem: 26157 Train: [31] [2100/6250] eta: 0:18:52 lr: 0.000103 grad: 0.0829 (0.0885) loss: 0.8189 (0.8163) time: 0.2689 data: 0.0002 max mem: 26157 Train: [31] [2200/6250] eta: 0:18:24 lr: 0.000103 grad: 0.0862 (0.0884) loss: 0.8158 (0.8163) time: 0.2708 data: 0.0002 max mem: 26157 Train: [31] [2300/6250] eta: 0:17:56 lr: 0.000103 grad: 0.0872 (0.0884) loss: 0.8215 (0.8163) time: 0.2693 data: 0.0002 max mem: 26157 Train: [31] [2400/6250] eta: 0:17:29 lr: 0.000103 grad: 0.0840 (0.0883) loss: 0.8115 (0.8163) time: 0.2684 data: 0.0002 max mem: 26157 Train: [31] [2500/6250] eta: 0:17:01 lr: 0.000103 grad: 0.0825 (0.0884) loss: 0.8156 (0.8163) time: 0.2708 data: 0.0002 max mem: 26157 Train: [31] [2600/6250] eta: 0:16:33 lr: 0.000103 grad: 0.0927 (0.0885) loss: 0.8133 (0.8162) time: 0.2702 data: 0.0002 max mem: 26157 Train: [31] [2700/6250] eta: 0:16:06 lr: 0.000103 grad: 0.0857 (0.0886) loss: 0.8190 (0.8162) time: 0.2698 data: 0.0002 max mem: 26157 Train: [31] [2800/6250] eta: 0:15:38 lr: 0.000103 grad: 0.0839 (0.0886) loss: 0.8221 (0.8162) time: 0.2721 data: 0.0002 max mem: 26157 Train: [31] [2900/6250] eta: 0:15:11 lr: 0.000103 grad: 0.0826 (0.0886) loss: 0.8190 (0.8162) time: 0.2683 data: 0.0002 max mem: 26157 Train: [31] [3000/6250] eta: 0:14:43 lr: 0.000103 grad: 0.0943 (0.0887) loss: 0.8179 (0.8162) time: 0.2695 data: 0.0002 max mem: 26157 Train: [31] [3100/6250] eta: 0:14:16 lr: 0.000103 grad: 0.0865 (0.0888) loss: 0.8176 (0.8161) time: 0.2724 data: 0.0002 max mem: 26157 Train: [31] [3200/6250] eta: 0:13:49 lr: 0.000102 grad: 0.0831 (0.0888) loss: 0.8165 (0.8161) time: 0.2721 data: 0.0002 max mem: 26157 Train: [31] [3300/6250] eta: 0:13:22 lr: 0.000102 grad: 0.0878 (0.0889) loss: 0.8145 (0.8160) time: 0.2691 data: 0.0002 max mem: 26157 Train: [31] [3400/6250] eta: 0:12:54 lr: 0.000102 grad: 0.0839 (0.0890) loss: 0.8171 (0.8160) time: 0.2686 data: 0.0002 max mem: 26157 Train: [31] [3500/6250] eta: 0:12:28 lr: 0.000102 grad: 0.0878 (0.0891) loss: 0.8199 (0.8161) time: 0.2692 data: 0.0002 max mem: 26157 Train: [31] [3600/6250] eta: 0:12:01 lr: 0.000102 grad: 0.0920 (0.0891) loss: 0.8115 (0.8160) time: 0.2693 data: 0.0002 max mem: 26157 Train: [31] [3700/6250] eta: 0:11:34 lr: 0.000102 grad: 0.0889 (0.0893) loss: 0.8086 (0.8159) time: 0.2741 data: 0.0002 max mem: 26157 Train: [31] [3800/6250] eta: 0:11:07 lr: 0.000102 grad: 0.0887 (0.0894) loss: 0.8146 (0.8158) time: 0.2686 data: 0.0002 max mem: 26157 Train: [31] [3900/6250] eta: 0:10:39 lr: 0.000102 grad: 0.0987 (0.0896) loss: 0.8024 (0.8156) time: 0.2686 data: 0.0002 max mem: 26157 Train: [31] [4000/6250] eta: 0:10:12 lr: 0.000102 grad: 0.0901 (0.0897) loss: 0.8109 (0.8155) time: 0.2687 data: 0.0002 max mem: 26157 Train: [31] [4100/6250] eta: 0:09:44 lr: 0.000102 grad: 0.0903 (0.0898) loss: 0.8119 (0.8154) time: 0.2713 data: 0.0002 max mem: 26157 Train: [31] [4200/6250] eta: 0:09:17 lr: 0.000102 grad: 0.0891 (0.0898) loss: 0.8093 (0.8154) time: 0.2706 data: 0.0002 max mem: 26157 Train: [31] [4300/6250] eta: 0:08:50 lr: 0.000102 grad: 0.0963 (0.0899) loss: 0.8132 (0.8153) time: 0.2679 data: 0.0001 max mem: 26157 Train: [31] [4400/6250] eta: 0:08:22 lr: 0.000102 grad: 0.0874 (0.0899) loss: 0.8095 (0.8152) time: 0.2718 data: 0.0002 max mem: 26157 Train: [31] [4500/6250] eta: 0:07:55 lr: 0.000102 grad: 0.0886 (0.0899) loss: 0.8100 (0.8151) time: 0.2721 data: 0.0004 max mem: 26157 Train: [31] [4600/6250] eta: 0:07:28 lr: 0.000102 grad: 0.0936 (0.0900) loss: 0.8130 (0.8150) time: 0.2689 data: 0.0002 max mem: 26157 Train: [31] [4700/6250] eta: 0:07:01 lr: 0.000102 grad: 0.0907 (0.0901) loss: 0.8159 (0.8149) time: 0.2779 data: 0.0002 max mem: 26157 Train: [31] [4800/6250] eta: 0:06:34 lr: 0.000102 grad: 0.0902 (0.0901) loss: 0.8123 (0.8149) time: 0.2684 data: 0.0002 max mem: 26157 Train: [31] [4900/6250] eta: 0:06:06 lr: 0.000102 grad: 0.0871 (0.0901) loss: 0.8112 (0.8148) time: 0.2690 data: 0.0002 max mem: 26157 Train: [31] [5000/6250] eta: 0:05:39 lr: 0.000102 grad: 0.0865 (0.0901) loss: 0.8101 (0.8147) time: 0.2693 data: 0.0002 max mem: 26157 Train: [31] [5100/6250] eta: 0:05:12 lr: 0.000102 grad: 0.0868 (0.0901) loss: 0.8111 (0.8147) time: 0.2681 data: 0.0002 max mem: 26157 Train: [31] [5200/6250] eta: 0:04:45 lr: 0.000102 grad: 0.0895 (0.0901) loss: 0.8134 (0.8147) time: 0.2695 data: 0.0002 max mem: 26157 Train: [31] [5300/6250] eta: 0:04:17 lr: 0.000102 grad: 0.0912 (0.0902) loss: 0.8128 (0.8147) time: 0.2702 data: 0.0002 max mem: 26157 Train: [31] [5400/6250] eta: 0:03:50 lr: 0.000102 grad: 0.0936 (0.0902) loss: 0.8148 (0.8146) time: 0.2675 data: 0.0001 max mem: 26157 Train: [31] [5500/6250] eta: 0:03:23 lr: 0.000102 grad: 0.0924 (0.0902) loss: 0.8108 (0.8146) time: 0.2687 data: 0.0002 max mem: 26157 Train: [31] [5600/6250] eta: 0:02:56 lr: 0.000102 grad: 0.0907 (0.0903) loss: 0.8067 (0.8145) time: 0.2681 data: 0.0002 max mem: 26157 Train: [31] [5700/6250] eta: 0:02:29 lr: 0.000102 grad: 0.0917 (0.0903) loss: 0.8092 (0.8145) time: 0.2742 data: 0.0002 max mem: 26157 Train: [31] [5800/6250] eta: 0:02:02 lr: 0.000102 grad: 0.0857 (0.0903) loss: 0.8105 (0.8144) time: 0.2692 data: 0.0002 max mem: 26157 Train: [31] [5900/6250] eta: 0:01:35 lr: 0.000102 grad: 0.0923 (0.0904) loss: 0.8101 (0.8143) time: 0.2699 data: 0.0002 max mem: 26157 Train: [31] [6000/6250] eta: 0:01:07 lr: 0.000102 grad: 0.0878 (0.0903) loss: 0.8100 (0.8144) time: 0.2690 data: 0.0002 max mem: 26157 Train: [31] [6100/6250] eta: 0:00:40 lr: 0.000102 grad: 0.0862 (0.0903) loss: 0.8096 (0.8143) time: 0.2714 data: 0.0002 max mem: 26157 Train: [31] [6200/6250] eta: 0:00:13 lr: 0.000102 grad: 0.0873 (0.0903) loss: 0.8108 (0.8143) time: 0.2685 data: 0.0002 max mem: 26157 Train: [31] [6249/6250] eta: 0:00:00 lr: 0.000102 grad: 0.0847 (0.0903) loss: 0.8193 (0.8143) time: 0.2691 data: 0.0002 max mem: 26157 Train: [31] Total time: 0:28:25 (0.2729 s / it) Averaged stats: lr: 0.000102 grad: 0.0847 (0.0903) loss: 0.8193 (0.8143) Eval (hcp-train-subset): [31] [ 0/62] eta: 0:05:13 loss: 0.8341 (0.8341) time: 5.0551 data: 4.9714 max mem: 26157 Eval (hcp-train-subset): [31] [61/62] eta: 0:00:00 loss: 0.8277 (0.8310) time: 0.1284 data: 0.0457 max mem: 26157 Eval (hcp-train-subset): [31] Total time: 0:00:12 (0.2091 s / it) Averaged stats (hcp-train-subset): loss: 0.8277 (0.8310) Making plots (hcp-train-subset): example=2 Eval (hcp-val): [31] [ 0/62] eta: 0:04:15 loss: 0.8236 (0.8236) time: 4.1175 data: 4.0128 max mem: 26157 Eval (hcp-val): [31] [61/62] eta: 0:00:00 loss: 0.8279 (0.8295) time: 0.1425 data: 0.0598 max mem: 26157 Eval (hcp-val): [31] Total time: 0:00:13 (0.2207 s / it) Averaged stats (hcp-val): loss: 0.8279 (0.8295) Making plots (hcp-val): example=41 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [32] [ 0/6250] eta: 7:57:20 lr: 0.000102 grad: 0.0622 (0.0622) loss: 0.8541 (0.8541) time: 4.5825 data: 4.2404 max mem: 26157 Train: [32] [ 100/6250] eta: 0:33:38 lr: 0.000102 grad: 0.0943 (0.0992) loss: 0.8256 (0.8337) time: 0.2717 data: 0.0002 max mem: 26157 Train: [32] [ 200/6250] eta: 0:30:11 lr: 0.000102 grad: 0.0785 (0.1011) loss: 0.8267 (0.8257) time: 0.2690 data: 0.0002 max mem: 26157 Train: [32] [ 300/6250] eta: 0:28:43 lr: 0.000102 grad: 0.0825 (0.0967) loss: 0.8219 (0.8243) time: 0.2707 data: 0.0002 max mem: 26157 Train: [32] [ 400/6250] eta: 0:27:47 lr: 0.000102 grad: 0.0839 (0.0947) loss: 0.8176 (0.8224) time: 0.2696 data: 0.0002 max mem: 26157 Train: [32] [ 500/6250] eta: 0:27:02 lr: 0.000102 grad: 0.0789 (0.0935) loss: 0.8165 (0.8212) time: 0.2691 data: 0.0002 max mem: 26157 Train: [32] [ 600/6250] eta: 0:26:25 lr: 0.000102 grad: 0.0832 (0.0925) loss: 0.8231 (0.8207) time: 0.2753 data: 0.0003 max mem: 26157 Train: [32] [ 700/6250] eta: 0:25:49 lr: 0.000102 grad: 0.0824 (0.0916) loss: 0.8208 (0.8204) time: 0.2714 data: 0.0002 max mem: 26157 Train: [32] [ 800/6250] eta: 0:25:16 lr: 0.000101 grad: 0.0816 (0.0907) loss: 0.8186 (0.8202) time: 0.2729 data: 0.0002 max mem: 26157 Train: [32] [ 900/6250] eta: 0:24:44 lr: 0.000101 grad: 0.0836 (0.0900) loss: 0.8179 (0.8201) time: 0.2717 data: 0.0002 max mem: 26157 Train: [32] [1000/6250] eta: 0:25:04 lr: 0.000101 grad: 0.0830 (0.0895) loss: 0.8099 (0.8197) time: 0.7604 data: 0.4879 max mem: 26157 Train: [32] [1100/6250] eta: 0:24:28 lr: 0.000101 grad: 0.0825 (0.0892) loss: 0.8180 (0.8195) time: 0.2709 data: 0.0002 max mem: 26157 Train: [32] [1200/6250] eta: 0:23:53 lr: 0.000101 grad: 0.0854 (0.0891) loss: 0.8133 (0.8191) time: 0.2692 data: 0.0002 max mem: 26157 Train: [32] [1300/6250] eta: 0:23:20 lr: 0.000101 grad: 0.0862 (0.0890) loss: 0.8167 (0.8189) time: 0.2686 data: 0.0002 max mem: 26157 Train: [32] [1400/6250] eta: 0:22:47 lr: 0.000101 grad: 0.0908 (0.0890) loss: 0.8118 (0.8185) time: 0.2689 data: 0.0002 max mem: 26157 Train: [32] [1500/6250] eta: 0:22:15 lr: 0.000101 grad: 0.0855 (0.0891) loss: 0.8139 (0.8181) time: 0.2699 data: 0.0002 max mem: 26157 Train: [32] [1600/6250] eta: 0:21:43 lr: 0.000101 grad: 0.0900 (0.0894) loss: 0.7993 (0.8175) time: 0.2692 data: 0.0002 max mem: 26157 Train: [32] [1700/6250] eta: 0:21:12 lr: 0.000101 grad: 0.0824 (0.0894) loss: 0.8111 (0.8170) time: 0.2711 data: 0.0002 max mem: 26157 Train: [32] [1800/6250] eta: 0:20:42 lr: 0.000101 grad: 0.0874 (0.0895) loss: 0.8114 (0.8166) time: 0.2690 data: 0.0002 max mem: 26157 Train: [32] [1900/6250] eta: 0:20:11 lr: 0.000101 grad: 0.0931 (0.0897) loss: 0.8135 (0.8164) time: 0.2678 data: 0.0002 max mem: 26157 Train: [32] [2000/6250] eta: 0:19:41 lr: 0.000101 grad: 0.0882 (0.0898) loss: 0.8074 (0.8161) time: 0.2681 data: 0.0002 max mem: 26157 Train: [32] [2100/6250] eta: 0:19:12 lr: 0.000101 grad: 0.0884 (0.0898) loss: 0.8140 (0.8159) time: 0.2703 data: 0.0002 max mem: 26157 Train: [32] [2200/6250] eta: 0:18:43 lr: 0.000101 grad: 0.0911 (0.0899) loss: 0.8106 (0.8156) time: 0.2762 data: 0.0002 max mem: 26157 Train: [32] [2300/6250] eta: 0:18:18 lr: 0.000101 grad: 0.0911 (0.0900) loss: 0.8150 (0.8155) time: 0.2705 data: 0.0002 max mem: 26157 Train: [32] [2400/6250] eta: 0:17:50 lr: 0.000101 grad: 0.0866 (0.0902) loss: 0.8103 (0.8153) time: 0.2685 data: 0.0002 max mem: 26157 Train: [32] [2500/6250] eta: 0:17:21 lr: 0.000101 grad: 0.0885 (0.0902) loss: 0.8096 (0.8151) time: 0.2682 data: 0.0002 max mem: 26157 Train: [32] [2600/6250] eta: 0:16:52 lr: 0.000101 grad: 0.0876 (0.0904) loss: 0.8066 (0.8147) time: 0.2690 data: 0.0002 max mem: 26157 Train: [32] [2700/6250] eta: 0:16:23 lr: 0.000101 grad: 0.0909 (0.0905) loss: 0.8073 (0.8144) time: 0.2690 data: 0.0003 max mem: 26157 Train: [32] [2800/6250] eta: 0:15:54 lr: 0.000101 grad: 0.0948 (0.0907) loss: 0.8083 (0.8143) time: 0.2678 data: 0.0002 max mem: 26157 Train: [32] [2900/6250] eta: 0:15:26 lr: 0.000101 grad: 0.0937 (0.0907) loss: 0.8075 (0.8141) time: 0.2705 data: 0.0003 max mem: 26157 Train: [32] [3000/6250] eta: 0:14:57 lr: 0.000101 grad: 0.0864 (0.0907) loss: 0.8130 (0.8141) time: 0.2682 data: 0.0002 max mem: 26157 Train: [32] [3100/6250] eta: 0:14:29 lr: 0.000101 grad: 0.0901 (0.0907) loss: 0.8088 (0.8139) time: 0.2698 data: 0.0002 max mem: 26157 Train: [32] [3200/6250] eta: 0:14:01 lr: 0.000101 grad: 0.0903 (0.0908) loss: 0.8170 (0.8138) time: 0.2693 data: 0.0002 max mem: 26157 Train: [32] [3300/6250] eta: 0:13:32 lr: 0.000101 grad: 0.0894 (0.0909) loss: 0.8113 (0.8136) time: 0.2674 data: 0.0002 max mem: 26157 Train: [32] [3400/6250] eta: 0:13:04 lr: 0.000101 grad: 0.0897 (0.0911) loss: 0.8072 (0.8135) time: 0.2690 data: 0.0002 max mem: 26157 Train: [32] [3500/6250] eta: 0:12:36 lr: 0.000101 grad: 0.0901 (0.0911) loss: 0.8084 (0.8133) time: 0.2697 data: 0.0002 max mem: 26157 Train: [32] [3600/6250] eta: 0:12:10 lr: 0.000101 grad: 0.0915 (0.0911) loss: 0.8075 (0.8131) time: 0.2701 data: 0.0002 max mem: 26157 Train: [32] [3700/6250] eta: 0:11:42 lr: 0.000101 grad: 0.0889 (0.0913) loss: 0.8041 (0.8130) time: 0.2687 data: 0.0002 max mem: 26157 Train: [32] [3800/6250] eta: 0:11:14 lr: 0.000101 grad: 0.0962 (0.0913) loss: 0.8038 (0.8128) time: 0.2702 data: 0.0002 max mem: 26157 Train: [32] [3900/6250] eta: 0:10:46 lr: 0.000101 grad: 0.0906 (0.0913) loss: 0.8038 (0.8127) time: 0.2705 data: 0.0002 max mem: 26157 Train: [32] [4000/6250] eta: 0:10:18 lr: 0.000101 grad: 0.0938 (0.0913) loss: 0.8027 (0.8126) time: 0.2703 data: 0.0002 max mem: 26157 Train: [32] [4100/6250] eta: 0:09:50 lr: 0.000101 grad: 0.0927 (0.0914) loss: 0.8043 (0.8125) time: 0.2698 data: 0.0002 max mem: 26157 Train: [32] [4200/6250] eta: 0:09:25 lr: 0.000101 grad: 0.0888 (0.0914) loss: 0.8104 (0.8124) time: 0.4731 data: 0.1966 max mem: 26157 Train: [32] [4300/6250] eta: 0:08:57 lr: 0.000101 grad: 0.0914 (0.0914) loss: 0.8173 (0.8123) time: 0.2692 data: 0.0002 max mem: 26157 Train: [32] [4400/6250] eta: 0:08:29 lr: 0.000101 grad: 0.0930 (0.0914) loss: 0.8084 (0.8122) time: 0.2708 data: 0.0002 max mem: 26157 Train: [32] [4500/6250] eta: 0:08:01 lr: 0.000101 grad: 0.0889 (0.0915) loss: 0.8152 (0.8122) time: 0.2705 data: 0.0002 max mem: 26157 Train: [32] [4600/6250] eta: 0:07:36 lr: 0.000101 grad: 0.0912 (0.0915) loss: 0.8111 (0.8122) time: 0.5663 data: 0.2879 max mem: 26157 Train: [32] [4700/6250] eta: 0:07:12 lr: 0.000100 grad: 0.0865 (0.0916) loss: 0.8116 (0.8121) time: 0.2720 data: 0.0002 max mem: 26157 Train: [32] [4800/6250] eta: 0:06:44 lr: 0.000100 grad: 0.0939 (0.0917) loss: 0.8137 (0.8121) time: 0.2710 data: 0.0002 max mem: 26157 Train: [32] [4900/6250] eta: 0:06:16 lr: 0.000100 grad: 0.0892 (0.0917) loss: 0.8171 (0.8122) time: 0.2688 data: 0.0002 max mem: 26157 Train: [32] [5000/6250] eta: 0:05:48 lr: 0.000100 grad: 0.0861 (0.0918) loss: 0.8153 (0.8122) time: 0.2687 data: 0.0002 max mem: 26157 Train: [32] [5100/6250] eta: 0:05:20 lr: 0.000100 grad: 0.0929 (0.0918) loss: 0.8168 (0.8122) time: 0.2684 data: 0.0002 max mem: 26157 Train: [32] [5200/6250] eta: 0:04:53 lr: 0.000100 grad: 0.0896 (0.0918) loss: 0.8133 (0.8122) time: 0.2740 data: 0.0002 max mem: 26157 Train: [32] [5300/6250] eta: 0:04:25 lr: 0.000100 grad: 0.0869 (0.0919) loss: 0.8133 (0.8122) time: 0.2696 data: 0.0002 max mem: 26157 Train: [32] [5400/6250] eta: 0:03:57 lr: 0.000100 grad: 0.0907 (0.0919) loss: 0.8067 (0.8122) time: 0.2707 data: 0.0002 max mem: 26157 Train: [32] [5500/6250] eta: 0:03:29 lr: 0.000100 grad: 0.0928 (0.0920) loss: 0.8111 (0.8122) time: 0.2700 data: 0.0002 max mem: 26157 Train: [32] [5600/6250] eta: 0:03:01 lr: 0.000100 grad: 0.0903 (0.0921) loss: 0.8103 (0.8122) time: 0.2682 data: 0.0002 max mem: 26157 Train: [32] [5700/6250] eta: 0:02:33 lr: 0.000100 grad: 0.0889 (0.0921) loss: 0.8111 (0.8121) time: 0.2682 data: 0.0002 max mem: 26157 Train: [32] [5800/6250] eta: 0:02:05 lr: 0.000100 grad: 0.0904 (0.0922) loss: 0.8163 (0.8122) time: 0.2686 data: 0.0002 max mem: 26157 Train: [32] [5900/6250] eta: 0:01:37 lr: 0.000100 grad: 0.0879 (0.0921) loss: 0.8090 (0.8121) time: 0.2701 data: 0.0002 max mem: 26157 Train: [32] [6000/6250] eta: 0:01:09 lr: 0.000100 grad: 0.0932 (0.0922) loss: 0.8058 (0.8121) time: 0.2703 data: 0.0002 max mem: 26157 Train: [32] [6100/6250] eta: 0:00:41 lr: 0.000100 grad: 0.0895 (0.0922) loss: 0.8161 (0.8121) time: 0.2711 data: 0.0002 max mem: 26157 Train: [32] [6200/6250] eta: 0:00:13 lr: 0.000100 grad: 0.0857 (0.0921) loss: 0.8116 (0.8121) time: 0.2689 data: 0.0002 max mem: 26157 Train: [32] [6249/6250] eta: 0:00:00 lr: 0.000100 grad: 0.0859 (0.0921) loss: 0.8080 (0.8121) time: 0.2685 data: 0.0002 max mem: 26157 Train: [32] Total time: 0:29:09 (0.2799 s / it) Averaged stats: lr: 0.000100 grad: 0.0859 (0.0921) loss: 0.8080 (0.8121) Eval (hcp-train-subset): [32] [ 0/62] eta: 0:05:18 loss: 0.8441 (0.8441) time: 5.1322 data: 5.0481 max mem: 26157 Eval (hcp-train-subset): [32] [61/62] eta: 0:00:00 loss: 0.8316 (0.8327) time: 0.1386 data: 0.0557 max mem: 26157 Eval (hcp-train-subset): [32] Total time: 0:00:13 (0.2154 s / it) Averaged stats (hcp-train-subset): loss: 0.8316 (0.8327) Making plots (hcp-train-subset): example=40 Eval (hcp-val): [32] [ 0/62] eta: 0:05:04 loss: 0.8283 (0.8283) time: 4.9150 data: 4.8304 max mem: 26157 Eval (hcp-val): [32] [61/62] eta: 0:00:00 loss: 0.8265 (0.8292) time: 0.1417 data: 0.0589 max mem: 26157 Eval (hcp-val): [32] Total time: 0:00:13 (0.2192 s / it) Averaged stats (hcp-val): loss: 0.8265 (0.8292) Making plots (hcp-val): example=10 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [33] [ 0/6250] eta: 8:57:33 lr: 0.000100 grad: 0.0820 (0.0820) loss: 0.8649 (0.8649) time: 5.1606 data: 4.7796 max mem: 26157 Train: [33] [ 100/6250] eta: 0:33:46 lr: 0.000100 grad: 0.0953 (0.1131) loss: 0.8246 (0.8239) time: 0.2703 data: 0.0003 max mem: 26157 Train: [33] [ 200/6250] eta: 0:30:16 lr: 0.000100 grad: 0.0921 (0.1082) loss: 0.8252 (0.8201) time: 0.2713 data: 0.0002 max mem: 26157 Train: [33] [ 300/6250] eta: 0:28:49 lr: 0.000100 grad: 0.0964 (0.1055) loss: 0.8136 (0.8167) time: 0.2799 data: 0.0006 max mem: 26157 Train: [33] [ 400/6250] eta: 0:27:49 lr: 0.000100 grad: 0.0884 (0.1023) loss: 0.8236 (0.8162) time: 0.2699 data: 0.0001 max mem: 26157 Train: [33] [ 500/6250] eta: 0:27:03 lr: 0.000100 grad: 0.0862 (0.0993) loss: 0.8199 (0.8167) time: 0.2705 data: 0.0002 max mem: 26157 Train: [33] [ 600/6250] eta: 0:26:24 lr: 0.000100 grad: 0.0927 (0.0976) loss: 0.8191 (0.8169) time: 0.2694 data: 0.0002 max mem: 26157 Train: [33] [ 700/6250] eta: 0:25:47 lr: 0.000100 grad: 0.0860 (0.0964) loss: 0.8162 (0.8172) time: 0.2693 data: 0.0002 max mem: 26157 Train: [33] [ 800/6250] eta: 0:25:14 lr: 0.000100 grad: 0.0838 (0.0960) loss: 0.8201 (0.8174) time: 0.2704 data: 0.0002 max mem: 26157 Train: [33] [ 900/6250] eta: 0:24:41 lr: 0.000100 grad: 0.0852 (0.0953) loss: 0.8156 (0.8174) time: 0.2695 data: 0.0002 max mem: 26157 Train: [33] [1000/6250] eta: 0:24:09 lr: 0.000100 grad: 0.0881 (0.0944) loss: 0.8173 (0.8174) time: 0.2690 data: 0.0002 max mem: 26157 Train: [33] [1100/6250] eta: 0:23:38 lr: 0.000100 grad: 0.0836 (0.0940) loss: 0.8176 (0.8172) time: 0.2701 data: 0.0002 max mem: 26157 Train: [33] [1200/6250] eta: 0:23:09 lr: 0.000100 grad: 0.0913 (0.0942) loss: 0.8064 (0.8170) time: 0.2687 data: 0.0002 max mem: 26157 Train: [33] [1300/6250] eta: 0:22:39 lr: 0.000100 grad: 0.0918 (0.0940) loss: 0.8126 (0.8167) time: 0.2688 data: 0.0002 max mem: 26157 Train: [33] [1400/6250] eta: 0:22:09 lr: 0.000100 grad: 0.0931 (0.0937) loss: 0.8097 (0.8164) time: 0.2705 data: 0.0002 max mem: 26157 Train: [33] [1500/6250] eta: 0:21:41 lr: 0.000100 grad: 0.0873 (0.0938) loss: 0.8117 (0.8159) time: 0.2702 data: 0.0002 max mem: 26157 Train: [33] [1600/6250] eta: 0:21:12 lr: 0.000100 grad: 0.0914 (0.0936) loss: 0.8132 (0.8157) time: 0.2694 data: 0.0002 max mem: 26157 Train: [33] [1700/6250] eta: 0:20:47 lr: 0.000100 grad: 0.0909 (0.0935) loss: 0.8116 (0.8154) time: 0.2700 data: 0.0002 max mem: 26157 Train: [33] [1800/6250] eta: 0:20:33 lr: 0.000100 grad: 0.0961 (0.0935) loss: 0.8105 (0.8152) time: 0.4895 data: 0.2163 max mem: 26157 Train: [33] [1900/6250] eta: 0:20:03 lr: 0.000100 grad: 0.0886 (0.0937) loss: 0.8180 (0.8150) time: 0.2702 data: 0.0002 max mem: 26157 Train: [33] [2000/6250] eta: 0:19:34 lr: 0.000100 grad: 0.0938 (0.0937) loss: 0.8092 (0.8148) time: 0.2707 data: 0.0002 max mem: 26157 Train: [33] [2100/6250] eta: 0:19:05 lr: 0.000100 grad: 0.0885 (0.0936) loss: 0.8144 (0.8147) time: 0.2682 data: 0.0002 max mem: 26157 Train: [33] [2200/6250] eta: 0:18:36 lr: 0.000099 grad: 0.0884 (0.0935) loss: 0.8071 (0.8144) time: 0.2690 data: 0.0002 max mem: 26157 Train: [33] [2300/6250] eta: 0:18:17 lr: 0.000099 grad: 0.1017 (0.0936) loss: 0.8049 (0.8142) time: 0.2712 data: 0.0002 max mem: 26157 Train: [33] [2400/6250] eta: 0:17:48 lr: 0.000099 grad: 0.0946 (0.0936) loss: 0.8083 (0.8141) time: 0.2721 data: 0.0002 max mem: 26157 Train: [33] [2500/6250] eta: 0:17:22 lr: 0.000099 grad: 0.0827 (0.0935) loss: 0.8115 (0.8140) time: 0.2700 data: 0.0002 max mem: 26157 Train: [33] [2600/6250] eta: 0:16:54 lr: 0.000099 grad: 0.0906 (0.0936) loss: 0.8109 (0.8139) time: 0.2685 data: 0.0002 max mem: 26157 Train: [33] [2700/6250] eta: 0:16:25 lr: 0.000099 grad: 0.0922 (0.0935) loss: 0.8059 (0.8138) time: 0.2687 data: 0.0001 max mem: 26157 Train: [33] [2800/6250] eta: 0:16:07 lr: 0.000099 grad: 0.0941 (0.0935) loss: 0.8036 (0.8137) time: 0.2706 data: 0.0002 max mem: 26157 Train: [33] [2900/6250] eta: 0:15:38 lr: 0.000099 grad: 0.0886 (0.0935) loss: 0.8079 (0.8136) time: 0.2717 data: 0.0002 max mem: 26157 Train: [33] [3000/6250] eta: 0:15:09 lr: 0.000099 grad: 0.0919 (0.0936) loss: 0.8083 (0.8135) time: 0.2703 data: 0.0002 max mem: 26157 Train: [33] [3100/6250] eta: 0:14:40 lr: 0.000099 grad: 0.0965 (0.0936) loss: 0.8095 (0.8134) time: 0.2691 data: 0.0002 max mem: 26157 Train: [33] [3200/6250] eta: 0:14:11 lr: 0.000099 grad: 0.0955 (0.0940) loss: 0.8101 (0.8133) time: 0.2690 data: 0.0002 max mem: 26157 Train: [33] [3300/6250] eta: 0:13:42 lr: 0.000099 grad: 0.0857 (0.0940) loss: 0.8130 (0.8132) time: 0.2719 data: 0.0002 max mem: 26157 Train: [33] [3400/6250] eta: 0:13:13 lr: 0.000099 grad: 0.0876 (0.0940) loss: 0.8167 (0.8131) time: 0.2690 data: 0.0002 max mem: 26157 Train: [33] [3500/6250] eta: 0:12:45 lr: 0.000099 grad: 0.0906 (0.0939) loss: 0.8129 (0.8131) time: 0.2686 data: 0.0002 max mem: 26157 Train: [33] [3600/6250] eta: 0:12:16 lr: 0.000099 grad: 0.0944 (0.0940) loss: 0.8071 (0.8129) time: 0.2687 data: 0.0002 max mem: 26157 Train: [33] [3700/6250] eta: 0:11:48 lr: 0.000099 grad: 0.0920 (0.0940) loss: 0.8103 (0.8128) time: 0.2700 data: 0.0002 max mem: 26157 Train: [33] [3800/6250] eta: 0:11:23 lr: 0.000099 grad: 0.0923 (0.0939) loss: 0.8129 (0.8127) time: 0.3133 data: 0.0410 max mem: 26157 Train: [33] [3900/6250] eta: 0:10:54 lr: 0.000099 grad: 0.0921 (0.0940) loss: 0.8086 (0.8126) time: 0.2688 data: 0.0002 max mem: 26157 Train: [33] [4000/6250] eta: 0:10:26 lr: 0.000099 grad: 0.0854 (0.0939) loss: 0.8115 (0.8125) time: 0.2678 data: 0.0002 max mem: 26157 Train: [33] [4100/6250] eta: 0:09:58 lr: 0.000099 grad: 0.0877 (0.0941) loss: 0.8131 (0.8124) time: 0.2686 data: 0.0002 max mem: 26157 Train: [33] [4200/6250] eta: 0:09:29 lr: 0.000099 grad: 0.0857 (0.0941) loss: 0.8133 (0.8124) time: 0.2684 data: 0.0002 max mem: 26157 Train: [33] [4300/6250] eta: 0:09:01 lr: 0.000099 grad: 0.0903 (0.0941) loss: 0.8063 (0.8123) time: 0.2691 data: 0.0002 max mem: 26157 Train: [33] [4400/6250] eta: 0:08:33 lr: 0.000099 grad: 0.0927 (0.0940) loss: 0.8104 (0.8123) time: 0.2695 data: 0.0002 max mem: 26157 Train: [33] [4500/6250] eta: 0:08:05 lr: 0.000099 grad: 0.0928 (0.0941) loss: 0.8129 (0.8123) time: 0.2692 data: 0.0002 max mem: 26157 Train: [33] [4600/6250] eta: 0:07:37 lr: 0.000099 grad: 0.0925 (0.0942) loss: 0.8106 (0.8122) time: 0.2700 data: 0.0002 max mem: 26157 Train: [33] [4700/6250] eta: 0:07:09 lr: 0.000099 grad: 0.0895 (0.0942) loss: 0.8095 (0.8122) time: 0.2702 data: 0.0002 max mem: 26157 Train: [33] [4800/6250] eta: 0:06:41 lr: 0.000099 grad: 0.0878 (0.0942) loss: 0.8086 (0.8121) time: 0.2697 data: 0.0002 max mem: 26157 Train: [33] [4900/6250] eta: 0:06:13 lr: 0.000099 grad: 0.0898 (0.0943) loss: 0.8055 (0.8120) time: 0.2695 data: 0.0002 max mem: 26157 Train: [33] [5000/6250] eta: 0:05:45 lr: 0.000099 grad: 0.0954 (0.0943) loss: 0.8056 (0.8120) time: 0.2691 data: 0.0002 max mem: 26157 Train: [33] [5100/6250] eta: 0:05:17 lr: 0.000099 grad: 0.0890 (0.0942) loss: 0.8134 (0.8120) time: 0.2694 data: 0.0002 max mem: 26157 Train: [33] [5200/6250] eta: 0:04:50 lr: 0.000099 grad: 0.0866 (0.0942) loss: 0.8095 (0.8119) time: 0.2709 data: 0.0002 max mem: 26157 Train: [33] [5300/6250] eta: 0:04:22 lr: 0.000099 grad: 0.0925 (0.0943) loss: 0.8010 (0.8119) time: 0.2698 data: 0.0002 max mem: 26157 Train: [33] [5400/6250] eta: 0:03:54 lr: 0.000099 grad: 0.0943 (0.0943) loss: 0.8108 (0.8118) time: 0.2775 data: 0.0002 max mem: 26157 Train: [33] [5500/6250] eta: 0:03:27 lr: 0.000099 grad: 0.0894 (0.0943) loss: 0.8116 (0.8118) time: 0.4104 data: 0.1330 max mem: 26157 Train: [33] [5600/6250] eta: 0:02:59 lr: 0.000099 grad: 0.0937 (0.0943) loss: 0.8112 (0.8117) time: 0.2686 data: 0.0001 max mem: 26157 Train: [33] [5700/6250] eta: 0:02:32 lr: 0.000099 grad: 0.0936 (0.0943) loss: 0.8087 (0.8117) time: 0.2690 data: 0.0002 max mem: 26157 Train: [33] [5800/6250] eta: 0:02:04 lr: 0.000099 grad: 0.0952 (0.0944) loss: 0.8011 (0.8115) time: 0.2692 data: 0.0002 max mem: 26157 Train: [33] [5900/6250] eta: 0:01:36 lr: 0.000098 grad: 0.0913 (0.0944) loss: 0.8073 (0.8115) time: 0.2691 data: 0.0002 max mem: 26157 Train: [33] [6000/6250] eta: 0:01:09 lr: 0.000098 grad: 0.0888 (0.0944) loss: 0.8134 (0.8115) time: 0.2718 data: 0.0002 max mem: 26157 Train: [33] [6100/6250] eta: 0:00:41 lr: 0.000098 grad: 0.0902 (0.0945) loss: 0.8080 (0.8114) time: 0.2732 data: 0.0002 max mem: 26157 Train: [33] [6200/6250] eta: 0:00:13 lr: 0.000098 grad: 0.0889 (0.0944) loss: 0.8164 (0.8114) time: 0.2697 data: 0.0002 max mem: 26157 Train: [33] [6249/6250] eta: 0:00:00 lr: 0.000098 grad: 0.0889 (0.0944) loss: 0.8102 (0.8114) time: 0.2727 data: 0.0002 max mem: 26157 Train: [33] Total time: 0:28:52 (0.2771 s / it) Averaged stats: lr: 0.000098 grad: 0.0889 (0.0944) loss: 0.8102 (0.8114) Eval (hcp-train-subset): [33] [ 0/62] eta: 0:06:08 loss: 0.8435 (0.8435) time: 5.9364 data: 5.8518 max mem: 26157 Eval (hcp-train-subset): [33] [61/62] eta: 0:00:00 loss: 0.8254 (0.8282) time: 0.1295 data: 0.0447 max mem: 26157 Eval (hcp-train-subset): [33] Total time: 0:00:13 (0.2143 s / it) Averaged stats (hcp-train-subset): loss: 0.8254 (0.8282) Making plots (hcp-train-subset): example=47 Eval (hcp-val): [33] [ 0/62] eta: 0:03:39 loss: 0.8238 (0.8238) time: 3.5374 data: 3.4269 max mem: 26157 Eval (hcp-val): [33] [61/62] eta: 0:00:00 loss: 0.8274 (0.8283) time: 0.1324 data: 0.0489 max mem: 26157 Eval (hcp-val): [33] Total time: 0:00:12 (0.2074 s / it) Averaged stats (hcp-val): loss: 0.8274 (0.8283) Making plots (hcp-val): example=26 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [34] [ 0/6250] eta: 12:26:52 lr: 0.000098 grad: 0.1531 (0.1531) loss: 0.7917 (0.7917) time: 7.1700 data: 6.8870 max mem: 26157 Train: [34] [ 100/6250] eta: 0:34:53 lr: 0.000098 grad: 0.0866 (0.0906) loss: 0.8311 (0.8316) time: 0.2724 data: 0.0002 max mem: 26157 Train: [34] [ 200/6250] eta: 0:30:49 lr: 0.000098 grad: 0.0896 (0.0894) loss: 0.8195 (0.8274) time: 0.2703 data: 0.0002 max mem: 26157 Train: [34] [ 300/6250] eta: 0:29:07 lr: 0.000098 grad: 0.0869 (0.0897) loss: 0.8247 (0.8244) time: 0.2708 data: 0.0002 max mem: 26157 Train: [34] [ 400/6250] eta: 0:28:08 lr: 0.000098 grad: 0.0864 (0.0894) loss: 0.8162 (0.8230) time: 0.2706 data: 0.0002 max mem: 26157 Train: [34] [ 500/6250] eta: 0:27:17 lr: 0.000098 grad: 0.0881 (0.0890) loss: 0.8170 (0.8220) time: 0.2699 data: 0.0002 max mem: 26157 Train: [34] [ 600/6250] eta: 0:26:35 lr: 0.000098 grad: 0.0802 (0.0884) loss: 0.8226 (0.8216) time: 0.2718 data: 0.0002 max mem: 26157 Train: [34] [ 700/6250] eta: 0:25:58 lr: 0.000098 grad: 0.0793 (0.0877) loss: 0.8184 (0.8215) time: 0.2698 data: 0.0002 max mem: 26157 Train: [34] [ 800/6250] eta: 0:25:22 lr: 0.000098 grad: 0.0787 (0.0872) loss: 0.8233 (0.8216) time: 0.2705 data: 0.0002 max mem: 26157 Train: [34] [ 900/6250] eta: 0:24:49 lr: 0.000098 grad: 0.0848 (0.0871) loss: 0.8160 (0.8214) time: 0.2681 data: 0.0001 max mem: 26157 Train: [34] [1000/6250] eta: 0:24:43 lr: 0.000098 grad: 0.0868 (0.0871) loss: 0.8256 (0.8213) time: 0.5099 data: 0.2354 max mem: 26157 Train: [34] [1100/6250] eta: 0:24:16 lr: 0.000098 grad: 0.0815 (0.0870) loss: 0.8190 (0.8209) time: 0.3351 data: 0.0641 max mem: 26157 Train: [34] [1200/6250] eta: 0:23:50 lr: 0.000098 grad: 0.0795 (0.0870) loss: 0.8152 (0.8205) time: 0.2713 data: 0.0002 max mem: 26157 Train: [34] [1300/6250] eta: 0:23:17 lr: 0.000098 grad: 0.0815 (0.0870) loss: 0.8175 (0.8202) time: 0.2709 data: 0.0002 max mem: 26157 Train: [34] [1400/6250] eta: 0:22:52 lr: 0.000098 grad: 0.0866 (0.0872) loss: 0.8182 (0.8199) time: 0.3804 data: 0.1085 max mem: 26157 Train: [34] [1500/6250] eta: 0:22:20 lr: 0.000098 grad: 0.0847 (0.0872) loss: 0.8075 (0.8195) time: 0.2686 data: 0.0002 max mem: 26157 Train: [34] [1600/6250] eta: 0:21:48 lr: 0.000098 grad: 0.0868 (0.0874) loss: 0.8078 (0.8189) time: 0.2691 data: 0.0002 max mem: 26157 Train: [34] [1700/6250] eta: 0:21:16 lr: 0.000098 grad: 0.0897 (0.0876) loss: 0.8128 (0.8185) time: 0.2701 data: 0.0002 max mem: 26157 Train: [34] [1800/6250] eta: 0:20:46 lr: 0.000098 grad: 0.0935 (0.0878) loss: 0.8110 (0.8181) time: 0.2706 data: 0.0002 max mem: 26157 Train: [34] [1900/6250] eta: 0:20:16 lr: 0.000098 grad: 0.0945 (0.0883) loss: 0.8100 (0.8176) time: 0.2690 data: 0.0002 max mem: 26157 Train: [34] [2000/6250] eta: 0:19:45 lr: 0.000098 grad: 0.0878 (0.0886) loss: 0.8068 (0.8172) time: 0.2693 data: 0.0002 max mem: 26157 Train: [34] [2100/6250] eta: 0:19:16 lr: 0.000098 grad: 0.0958 (0.0889) loss: 0.8110 (0.8167) time: 0.2702 data: 0.0002 max mem: 26157 Train: [34] [2200/6250] eta: 0:18:47 lr: 0.000098 grad: 0.0935 (0.0893) loss: 0.8098 (0.8163) time: 0.2746 data: 0.0002 max mem: 26157 Train: [34] [2300/6250] eta: 0:18:18 lr: 0.000098 grad: 0.0912 (0.0894) loss: 0.8037 (0.8160) time: 0.2688 data: 0.0002 max mem: 26157 Train: [34] [2400/6250] eta: 0:17:48 lr: 0.000098 grad: 0.0934 (0.0897) loss: 0.8097 (0.8156) time: 0.2687 data: 0.0002 max mem: 26157 Train: [34] [2500/6250] eta: 0:17:20 lr: 0.000098 grad: 0.0899 (0.0899) loss: 0.8086 (0.8153) time: 0.2689 data: 0.0002 max mem: 26157 Train: [34] [2600/6250] eta: 0:16:51 lr: 0.000098 grad: 0.0967 (0.0902) loss: 0.8008 (0.8149) time: 0.2718 data: 0.0002 max mem: 26157 Train: [34] [2700/6250] eta: 0:16:22 lr: 0.000098 grad: 0.0947 (0.0904) loss: 0.8040 (0.8147) time: 0.2701 data: 0.0002 max mem: 26157 Train: [34] [2800/6250] eta: 0:15:54 lr: 0.000098 grad: 0.0937 (0.0906) loss: 0.8092 (0.8144) time: 0.2685 data: 0.0002 max mem: 26157 Train: [34] [2900/6250] eta: 0:15:25 lr: 0.000098 grad: 0.0958 (0.0909) loss: 0.8040 (0.8142) time: 0.2682 data: 0.0001 max mem: 26157 Train: [34] [3000/6250] eta: 0:14:57 lr: 0.000098 grad: 0.0859 (0.0911) loss: 0.8157 (0.8141) time: 0.2692 data: 0.0002 max mem: 26157 Train: [34] [3100/6250] eta: 0:14:29 lr: 0.000098 grad: 0.0950 (0.0912) loss: 0.8103 (0.8140) time: 0.2704 data: 0.0002 max mem: 26157 Train: [34] [3200/6250] eta: 0:14:00 lr: 0.000098 grad: 0.0892 (0.0913) loss: 0.8112 (0.8139) time: 0.2699 data: 0.0002 max mem: 26157 Train: [34] [3300/6250] eta: 0:13:33 lr: 0.000097 grad: 0.0941 (0.0914) loss: 0.8076 (0.8138) time: 0.2863 data: 0.0167 max mem: 26157 Train: [34] [3400/6250] eta: 0:13:04 lr: 0.000097 grad: 0.0967 (0.0916) loss: 0.8089 (0.8137) time: 0.2693 data: 0.0002 max mem: 26157 Train: [34] [3500/6250] eta: 0:12:38 lr: 0.000097 grad: 0.0911 (0.0916) loss: 0.8152 (0.8136) time: 0.2694 data: 0.0002 max mem: 26157 Train: [34] [3600/6250] eta: 0:12:10 lr: 0.000097 grad: 0.0920 (0.0918) loss: 0.8081 (0.8135) time: 0.2869 data: 0.0126 max mem: 26157 Train: [34] [3700/6250] eta: 0:11:42 lr: 0.000097 grad: 0.0922 (0.0919) loss: 0.8079 (0.8135) time: 0.2711 data: 0.0002 max mem: 26157 Train: [34] [3800/6250] eta: 0:11:14 lr: 0.000097 grad: 0.0876 (0.0921) loss: 0.8132 (0.8135) time: 0.2697 data: 0.0002 max mem: 26157 Train: [34] [3900/6250] eta: 0:10:50 lr: 0.000097 grad: 0.0937 (0.0923) loss: 0.8096 (0.8134) time: 0.5927 data: 0.3224 max mem: 26157 Train: [34] [4000/6250] eta: 0:10:22 lr: 0.000097 grad: 0.0921 (0.0925) loss: 0.8116 (0.8134) time: 0.2693 data: 0.0002 max mem: 26157 Train: [34] [4100/6250] eta: 0:09:54 lr: 0.000097 grad: 0.0880 (0.0925) loss: 0.8146 (0.8134) time: 0.2686 data: 0.0002 max mem: 26157 Train: [34] [4200/6250] eta: 0:09:26 lr: 0.000097 grad: 0.0968 (0.0925) loss: 0.8097 (0.8134) time: 0.2689 data: 0.0001 max mem: 26157 Train: [34] [4300/6250] eta: 0:08:58 lr: 0.000097 grad: 0.0965 (0.0926) loss: 0.8135 (0.8134) time: 0.2682 data: 0.0001 max mem: 26157 Train: [34] [4400/6250] eta: 0:08:30 lr: 0.000097 grad: 0.1024 (0.0927) loss: 0.8119 (0.8133) time: 0.2693 data: 0.0002 max mem: 26157 Train: [34] [4500/6250] eta: 0:08:02 lr: 0.000097 grad: 0.0983 (0.0930) loss: 0.8036 (0.8133) time: 0.2679 data: 0.0002 max mem: 26157 Train: [34] [4600/6250] eta: 0:07:35 lr: 0.000097 grad: 0.0898 (0.0931) loss: 0.8143 (0.8132) time: 0.2691 data: 0.0002 max mem: 26157 Train: [34] [4700/6250] eta: 0:07:07 lr: 0.000097 grad: 0.0955 (0.0931) loss: 0.8138 (0.8132) time: 0.2690 data: 0.0001 max mem: 26157 Train: [34] [4800/6250] eta: 0:06:39 lr: 0.000097 grad: 0.0918 (0.0931) loss: 0.8140 (0.8132) time: 0.2705 data: 0.0002 max mem: 26157 Train: [34] [4900/6250] eta: 0:06:11 lr: 0.000097 grad: 0.0917 (0.0931) loss: 0.8141 (0.8132) time: 0.2707 data: 0.0002 max mem: 26157 Train: [34] [5000/6250] eta: 0:05:44 lr: 0.000097 grad: 0.0930 (0.0932) loss: 0.8115 (0.8133) time: 0.2698 data: 0.0002 max mem: 26157 Train: [34] [5100/6250] eta: 0:05:16 lr: 0.000097 grad: 0.0931 (0.0932) loss: 0.8156 (0.8133) time: 0.2695 data: 0.0002 max mem: 26157 Train: [34] [5200/6250] eta: 0:04:49 lr: 0.000097 grad: 0.0898 (0.0932) loss: 0.8192 (0.8134) time: 0.2705 data: 0.0002 max mem: 26157 Train: [34] [5300/6250] eta: 0:04:21 lr: 0.000097 grad: 0.0953 (0.0932) loss: 0.8148 (0.8134) time: 0.2689 data: 0.0002 max mem: 26157 Train: [34] [5400/6250] eta: 0:03:54 lr: 0.000097 grad: 0.0929 (0.0932) loss: 0.8162 (0.8134) time: 0.2693 data: 0.0002 max mem: 26157 Train: [34] [5500/6250] eta: 0:03:26 lr: 0.000097 grad: 0.0898 (0.0933) loss: 0.8160 (0.8135) time: 0.2699 data: 0.0002 max mem: 26157 Train: [34] [5600/6250] eta: 0:02:58 lr: 0.000097 grad: 0.0868 (0.0934) loss: 0.8159 (0.8135) time: 0.2709 data: 0.0002 max mem: 26157 Train: [34] [5700/6250] eta: 0:02:31 lr: 0.000097 grad: 0.0837 (0.0934) loss: 0.8150 (0.8135) time: 0.2706 data: 0.0002 max mem: 26157 Train: [34] [5800/6250] eta: 0:02:03 lr: 0.000097 grad: 0.0883 (0.0934) loss: 0.8095 (0.8135) time: 0.2711 data: 0.0002 max mem: 26157 Train: [34] [5900/6250] eta: 0:01:36 lr: 0.000097 grad: 0.0885 (0.0934) loss: 0.8144 (0.8135) time: 0.2705 data: 0.0002 max mem: 26157 Train: [34] [6000/6250] eta: 0:01:08 lr: 0.000097 grad: 0.0947 (0.0934) loss: 0.8179 (0.8136) time: 0.2692 data: 0.0002 max mem: 26157 Train: [34] [6100/6250] eta: 0:00:41 lr: 0.000097 grad: 0.0925 (0.0934) loss: 0.8173 (0.8136) time: 0.2691 data: 0.0002 max mem: 26157 Train: [34] [6200/6250] eta: 0:00:13 lr: 0.000097 grad: 0.0990 (0.0934) loss: 0.8121 (0.8136) time: 0.2680 data: 0.0001 max mem: 26157 Train: [34] [6249/6250] eta: 0:00:00 lr: 0.000097 grad: 0.0934 (0.0934) loss: 0.8127 (0.8137) time: 0.2696 data: 0.0002 max mem: 26157 Train: [34] Total time: 0:28:43 (0.2757 s / it) Averaged stats: lr: 0.000097 grad: 0.0934 (0.0934) loss: 0.8127 (0.8137) Eval (hcp-train-subset): [34] [ 0/62] eta: 0:04:52 loss: 0.8356 (0.8356) time: 4.7236 data: 4.6390 max mem: 26157 Eval (hcp-train-subset): [34] [61/62] eta: 0:00:00 loss: 0.8303 (0.8310) time: 0.1238 data: 0.0411 max mem: 26157 Eval (hcp-train-subset): [34] Total time: 0:00:12 (0.2080 s / it) Averaged stats (hcp-train-subset): loss: 0.8303 (0.8310) Making plots (hcp-train-subset): example=9 Eval (hcp-val): [34] [ 0/62] eta: 0:05:32 loss: 0.8221 (0.8221) time: 5.3688 data: 5.2847 max mem: 26157 Eval (hcp-val): [34] [61/62] eta: 0:00:00 loss: 0.8280 (0.8284) time: 0.1291 data: 0.0438 max mem: 26157 Eval (hcp-val): [34] Total time: 0:00:13 (0.2148 s / it) Averaged stats (hcp-val): loss: 0.8280 (0.8284) Making plots (hcp-val): example=60 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [35] [ 0/6250] eta: 7:21:51 lr: 0.000097 grad: 0.0647 (0.0647) loss: 0.8761 (0.8761) time: 4.2418 data: 3.8645 max mem: 26157 Train: [35] [ 100/6250] eta: 0:35:24 lr: 0.000097 grad: 0.0934 (0.1045) loss: 0.8261 (0.8268) time: 0.2687 data: 0.0002 max mem: 26157 Train: [35] [ 200/6250] eta: 0:31:02 lr: 0.000097 grad: 0.0974 (0.1022) loss: 0.8129 (0.8202) time: 0.2700 data: 0.0002 max mem: 26157 Train: [35] [ 300/6250] eta: 0:29:15 lr: 0.000097 grad: 0.0954 (0.1008) loss: 0.8104 (0.8179) time: 0.2699 data: 0.0002 max mem: 26157 Train: [35] [ 400/6250] eta: 0:28:10 lr: 0.000097 grad: 0.0942 (0.0988) loss: 0.8118 (0.8168) time: 0.2721 data: 0.0002 max mem: 26157 Train: [35] [ 500/6250] eta: 0:27:21 lr: 0.000097 grad: 0.0863 (0.0977) loss: 0.8134 (0.8162) time: 0.2706 data: 0.0002 max mem: 26157 Train: [35] [ 600/6250] eta: 0:26:38 lr: 0.000097 grad: 0.0896 (0.0967) loss: 0.8119 (0.8160) time: 0.2708 data: 0.0002 max mem: 26157 Train: [35] [ 700/6250] eta: 0:25:58 lr: 0.000096 grad: 0.0913 (0.0966) loss: 0.8139 (0.8154) time: 0.2690 data: 0.0002 max mem: 26157 Train: [35] [ 800/6250] eta: 0:25:23 lr: 0.000096 grad: 0.0872 (0.0962) loss: 0.8190 (0.8148) time: 0.2721 data: 0.0002 max mem: 26157 Train: [35] [ 900/6250] eta: 0:24:49 lr: 0.000096 grad: 0.0932 (0.0961) loss: 0.8082 (0.8144) time: 0.2690 data: 0.0002 max mem: 26157 Train: [35] [1000/6250] eta: 0:24:17 lr: 0.000096 grad: 0.0906 (0.0959) loss: 0.8140 (0.8141) time: 0.2699 data: 0.0002 max mem: 26157 Train: [35] [1100/6250] eta: 0:23:52 lr: 0.000096 grad: 0.0894 (0.0957) loss: 0.8116 (0.8137) time: 0.2723 data: 0.0002 max mem: 26157 Train: [35] [1200/6250] eta: 0:23:43 lr: 0.000096 grad: 0.0866 (0.0953) loss: 0.8070 (0.8133) time: 0.2719 data: 0.0002 max mem: 26157 Train: [35] [1300/6250] eta: 0:23:09 lr: 0.000096 grad: 0.0943 (0.0952) loss: 0.8094 (0.8128) time: 0.2678 data: 0.0002 max mem: 26157 Train: [35] [1400/6250] eta: 0:22:38 lr: 0.000096 grad: 0.0924 (0.0951) loss: 0.8112 (0.8126) time: 0.2721 data: 0.0003 max mem: 26157 Train: [35] [1500/6250] eta: 0:22:16 lr: 0.000096 grad: 0.0975 (0.0954) loss: 0.8113 (0.8123) time: 0.2718 data: 0.0002 max mem: 26157 Train: [35] [1600/6250] eta: 0:21:47 lr: 0.000096 grad: 0.0976 (0.0959) loss: 0.8054 (0.8120) time: 0.2691 data: 0.0001 max mem: 26157 Train: [35] [1700/6250] eta: 0:21:20 lr: 0.000096 grad: 0.0906 (0.0957) loss: 0.8172 (0.8119) time: 0.3516 data: 0.0814 max mem: 26157 Train: [35] [1800/6250] eta: 0:20:53 lr: 0.000096 grad: 0.0864 (0.0957) loss: 0.8086 (0.8118) time: 0.2695 data: 0.0002 max mem: 26157 Train: [35] [1900/6250] eta: 0:20:22 lr: 0.000096 grad: 0.0926 (0.0955) loss: 0.8063 (0.8116) time: 0.2727 data: 0.0002 max mem: 26157 Train: [35] [2000/6250] eta: 0:19:51 lr: 0.000096 grad: 0.0872 (0.0955) loss: 0.8124 (0.8116) time: 0.2676 data: 0.0002 max mem: 26157 Train: [35] [2100/6250] eta: 0:19:21 lr: 0.000096 grad: 0.0941 (0.0955) loss: 0.8180 (0.8115) time: 0.2673 data: 0.0002 max mem: 26157 Train: [35] [2200/6250] eta: 0:18:51 lr: 0.000096 grad: 0.0934 (0.0955) loss: 0.8137 (0.8114) time: 0.2688 data: 0.0002 max mem: 26157 Train: [35] [2300/6250] eta: 0:18:21 lr: 0.000096 grad: 0.0879 (0.0954) loss: 0.8139 (0.8114) time: 0.2680 data: 0.0002 max mem: 26157 Train: [35] [2400/6250] eta: 0:17:52 lr: 0.000096 grad: 0.0916 (0.0953) loss: 0.8072 (0.8113) time: 0.2694 data: 0.0002 max mem: 26157 Train: [35] [2500/6250] eta: 0:17:23 lr: 0.000096 grad: 0.0919 (0.0952) loss: 0.8102 (0.8112) time: 0.2684 data: 0.0002 max mem: 26157 Train: [35] [2600/6250] eta: 0:16:54 lr: 0.000096 grad: 0.0944 (0.0952) loss: 0.8074 (0.8110) time: 0.2725 data: 0.0003 max mem: 26157 Train: [35] [2700/6250] eta: 0:16:25 lr: 0.000096 grad: 0.0947 (0.0953) loss: 0.8048 (0.8109) time: 0.2715 data: 0.0002 max mem: 26157 Train: [35] [2800/6250] eta: 0:15:56 lr: 0.000096 grad: 0.0922 (0.0953) loss: 0.8080 (0.8109) time: 0.2718 data: 0.0002 max mem: 26157 Train: [35] [2900/6250] eta: 0:15:28 lr: 0.000096 grad: 0.0960 (0.0953) loss: 0.8129 (0.8109) time: 0.2688 data: 0.0002 max mem: 26157 Train: [35] [3000/6250] eta: 0:14:59 lr: 0.000096 grad: 0.0940 (0.0953) loss: 0.8089 (0.8108) time: 0.2685 data: 0.0002 max mem: 26157 Train: [35] [3100/6250] eta: 0:14:31 lr: 0.000096 grad: 0.0912 (0.0953) loss: 0.8112 (0.8108) time: 0.2702 data: 0.0002 max mem: 26157 Train: [35] [3200/6250] eta: 0:14:02 lr: 0.000096 grad: 0.0950 (0.0953) loss: 0.8029 (0.8107) time: 0.2691 data: 0.0002 max mem: 26157 Train: [35] [3300/6250] eta: 0:13:34 lr: 0.000096 grad: 0.0912 (0.0952) loss: 0.8134 (0.8108) time: 0.2688 data: 0.0002 max mem: 26157 Train: [35] [3400/6250] eta: 0:13:06 lr: 0.000096 grad: 0.0969 (0.0953) loss: 0.8072 (0.8108) time: 0.2698 data: 0.0002 max mem: 26157 Train: [35] [3500/6250] eta: 0:12:38 lr: 0.000096 grad: 0.0849 (0.0952) loss: 0.8128 (0.8108) time: 0.2685 data: 0.0001 max mem: 26157 Train: [35] [3600/6250] eta: 0:12:10 lr: 0.000096 grad: 0.0988 (0.0952) loss: 0.8134 (0.8108) time: 0.2687 data: 0.0002 max mem: 26157 Train: [35] [3700/6250] eta: 0:11:42 lr: 0.000096 grad: 0.0921 (0.0952) loss: 0.8092 (0.8108) time: 0.2690 data: 0.0002 max mem: 26157 Train: [35] [3800/6250] eta: 0:11:14 lr: 0.000096 grad: 0.0957 (0.0952) loss: 0.8033 (0.8108) time: 0.2679 data: 0.0002 max mem: 26157 Train: [35] [3900/6250] eta: 0:10:46 lr: 0.000096 grad: 0.0894 (0.0952) loss: 0.8122 (0.8107) time: 0.2702 data: 0.0002 max mem: 26157 Train: [35] [4000/6250] eta: 0:10:18 lr: 0.000096 grad: 0.0924 (0.0952) loss: 0.8095 (0.8107) time: 0.2703 data: 0.0002 max mem: 26157 Train: [35] [4100/6250] eta: 0:09:50 lr: 0.000096 grad: 0.0946 (0.0952) loss: 0.8099 (0.8106) time: 0.2709 data: 0.0002 max mem: 26157 Train: [35] [4200/6250] eta: 0:09:23 lr: 0.000096 grad: 0.0910 (0.0952) loss: 0.8090 (0.8106) time: 0.2683 data: 0.0001 max mem: 26157 Train: [35] [4300/6250] eta: 0:08:55 lr: 0.000095 grad: 0.0902 (0.0952) loss: 0.8126 (0.8105) time: 0.2686 data: 0.0001 max mem: 26157 Train: [35] [4400/6250] eta: 0:08:27 lr: 0.000095 grad: 0.0909 (0.0953) loss: 0.8064 (0.8105) time: 0.2710 data: 0.0002 max mem: 26157 Train: [35] [4500/6250] eta: 0:08:00 lr: 0.000095 grad: 0.1054 (0.0954) loss: 0.8063 (0.8104) time: 0.2715 data: 0.0002 max mem: 26157 Train: [35] [4600/6250] eta: 0:07:36 lr: 0.000095 grad: 0.0894 (0.0954) loss: 0.8127 (0.8103) time: 0.2698 data: 0.0002 max mem: 26157 Train: [35] [4700/6250] eta: 0:07:09 lr: 0.000095 grad: 0.0943 (0.0954) loss: 0.8064 (0.8103) time: 0.2686 data: 0.0002 max mem: 26157 Train: [35] [4800/6250] eta: 0:06:42 lr: 0.000095 grad: 0.0945 (0.0954) loss: 0.8092 (0.8102) time: 0.5043 data: 0.2308 max mem: 26157 Train: [35] [4900/6250] eta: 0:06:14 lr: 0.000095 grad: 0.0945 (0.0956) loss: 0.8059 (0.8102) time: 0.2694 data: 0.0002 max mem: 26157 Train: [35] [5000/6250] eta: 0:05:46 lr: 0.000095 grad: 0.0926 (0.0955) loss: 0.8074 (0.8102) time: 0.2695 data: 0.0002 max mem: 26157 Train: [35] [5100/6250] eta: 0:05:18 lr: 0.000095 grad: 0.0917 (0.0955) loss: 0.8174 (0.8102) time: 0.2704 data: 0.0002 max mem: 26157 Train: [35] [5200/6250] eta: 0:04:50 lr: 0.000095 grad: 0.0873 (0.0955) loss: 0.8123 (0.8102) time: 0.2732 data: 0.0002 max mem: 26157 Train: [35] [5300/6250] eta: 0:04:23 lr: 0.000095 grad: 0.0921 (0.0955) loss: 0.8109 (0.8102) time: 0.2685 data: 0.0002 max mem: 26157 Train: [35] [5400/6250] eta: 0:03:55 lr: 0.000095 grad: 0.0938 (0.0955) loss: 0.8100 (0.8102) time: 0.2710 data: 0.0002 max mem: 26157 Train: [35] [5500/6250] eta: 0:03:28 lr: 0.000095 grad: 0.0889 (0.0955) loss: 0.8144 (0.8102) time: 0.6104 data: 0.3354 max mem: 26157 Train: [35] [5600/6250] eta: 0:03:00 lr: 0.000095 grad: 0.0895 (0.0956) loss: 0.8146 (0.8102) time: 0.2693 data: 0.0002 max mem: 26157 Train: [35] [5700/6250] eta: 0:02:32 lr: 0.000095 grad: 0.0838 (0.0956) loss: 0.8116 (0.8101) time: 0.2718 data: 0.0002 max mem: 26157 Train: [35] [5800/6250] eta: 0:02:05 lr: 0.000095 grad: 0.0929 (0.0955) loss: 0.8128 (0.8101) time: 0.2696 data: 0.0002 max mem: 26157 Train: [35] [5900/6250] eta: 0:01:37 lr: 0.000095 grad: 0.0896 (0.0955) loss: 0.8119 (0.8101) time: 0.2699 data: 0.0002 max mem: 26157 Train: [35] [6000/6250] eta: 0:01:09 lr: 0.000095 grad: 0.0928 (0.0954) loss: 0.8096 (0.8101) time: 0.2704 data: 0.0002 max mem: 26157 Train: [35] [6100/6250] eta: 0:00:41 lr: 0.000095 grad: 0.0859 (0.0954) loss: 0.8138 (0.8101) time: 0.2684 data: 0.0002 max mem: 26157 Train: [35] [6200/6250] eta: 0:00:13 lr: 0.000095 grad: 0.0955 (0.0954) loss: 0.8074 (0.8101) time: 0.2689 data: 0.0002 max mem: 26157 Train: [35] [6249/6250] eta: 0:00:00 lr: 0.000095 grad: 0.0937 (0.0954) loss: 0.8186 (0.8101) time: 0.2696 data: 0.0002 max mem: 26157 Train: [35] Total time: 0:29:00 (0.2785 s / it) Averaged stats: lr: 0.000095 grad: 0.0937 (0.0954) loss: 0.8186 (0.8101) Eval (hcp-train-subset): [35] [ 0/62] eta: 0:03:56 loss: 0.8401 (0.8401) time: 3.8113 data: 3.6975 max mem: 26157 Eval (hcp-train-subset): [35] [61/62] eta: 0:00:00 loss: 0.8277 (0.8288) time: 0.1243 data: 0.0413 max mem: 26157 Eval (hcp-train-subset): [35] Total time: 0:00:13 (0.2101 s / it) Averaged stats (hcp-train-subset): loss: 0.8277 (0.8288) Making plots (hcp-train-subset): example=35 Eval (hcp-val): [35] [ 0/62] eta: 0:04:55 loss: 0.8248 (0.8248) time: 4.7674 data: 4.6816 max mem: 26157 Eval (hcp-val): [35] [61/62] eta: 0:00:00 loss: 0.8273 (0.8278) time: 0.1305 data: 0.0479 max mem: 26157 Eval (hcp-val): [35] Total time: 0:00:13 (0.2162 s / it) Averaged stats (hcp-val): loss: 0.8273 (0.8278) Making plots (hcp-val): example=35 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [36] [ 0/6250] eta: 10:33:47 lr: 0.000095 grad: 0.0796 (0.0796) loss: 0.8487 (0.8487) time: 6.0844 data: 5.7880 max mem: 26157 Train: [36] [ 100/6250] eta: 0:33:36 lr: 0.000095 grad: 0.0963 (0.1008) loss: 0.8198 (0.8282) time: 0.2687 data: 0.0001 max mem: 26157 Train: [36] [ 200/6250] eta: 0:30:09 lr: 0.000095 grad: 0.0983 (0.1061) loss: 0.8123 (0.8219) time: 0.2698 data: 0.0002 max mem: 26157 Train: [36] [ 300/6250] eta: 0:28:40 lr: 0.000095 grad: 0.0922 (0.1043) loss: 0.8032 (0.8181) time: 0.2694 data: 0.0002 max mem: 26157 Train: [36] [ 400/6250] eta: 0:27:43 lr: 0.000095 grad: 0.0929 (0.1021) loss: 0.8072 (0.8162) time: 0.2720 data: 0.0002 max mem: 26157 Train: [36] [ 500/6250] eta: 0:26:58 lr: 0.000095 grad: 0.0905 (0.1002) loss: 0.8135 (0.8151) time: 0.2698 data: 0.0002 max mem: 26157 Train: [36] [ 600/6250] eta: 0:26:21 lr: 0.000095 grad: 0.0876 (0.0986) loss: 0.8166 (0.8148) time: 0.2700 data: 0.0002 max mem: 26157 Train: [36] [ 700/6250] eta: 0:25:46 lr: 0.000095 grad: 0.0861 (0.0974) loss: 0.8129 (0.8145) time: 0.2695 data: 0.0002 max mem: 26157 Train: [36] [ 800/6250] eta: 0:25:13 lr: 0.000095 grad: 0.0902 (0.0966) loss: 0.8084 (0.8143) time: 0.2726 data: 0.0002 max mem: 26157 Train: [36] [ 900/6250] eta: 0:24:54 lr: 0.000095 grad: 0.0903 (0.0960) loss: 0.8190 (0.8145) time: 0.2750 data: 0.0002 max mem: 26157 Train: [36] [1000/6250] eta: 0:24:42 lr: 0.000095 grad: 0.0877 (0.0955) loss: 0.8121 (0.8146) time: 0.2742 data: 0.0003 max mem: 26157 Train: [36] [1100/6250] eta: 0:24:09 lr: 0.000095 grad: 0.0933 (0.0952) loss: 0.8082 (0.8142) time: 0.2695 data: 0.0002 max mem: 26157 Train: [36] [1200/6250] eta: 0:23:36 lr: 0.000095 grad: 0.0893 (0.0953) loss: 0.8104 (0.8140) time: 0.2708 data: 0.0002 max mem: 26157 Train: [36] [1300/6250] eta: 0:23:04 lr: 0.000095 grad: 0.0894 (0.0951) loss: 0.8141 (0.8138) time: 0.2690 data: 0.0002 max mem: 26157 Train: [36] [1400/6250] eta: 0:22:33 lr: 0.000095 grad: 0.0920 (0.0951) loss: 0.8102 (0.8136) time: 0.2709 data: 0.0002 max mem: 26157 Train: [36] [1500/6250] eta: 0:22:02 lr: 0.000095 grad: 0.0938 (0.0952) loss: 0.8084 (0.8134) time: 0.2683 data: 0.0002 max mem: 26157 Train: [36] [1600/6250] eta: 0:21:32 lr: 0.000094 grad: 0.0960 (0.0951) loss: 0.8104 (0.8132) time: 0.2688 data: 0.0002 max mem: 26157 Train: [36] [1700/6250] eta: 0:21:02 lr: 0.000094 grad: 0.0892 (0.0950) loss: 0.8148 (0.8132) time: 0.2686 data: 0.0002 max mem: 26157 Train: [36] [1800/6250] eta: 0:20:32 lr: 0.000094 grad: 0.0895 (0.0951) loss: 0.8217 (0.8131) time: 0.2698 data: 0.0002 max mem: 26157 Train: [36] [1900/6250] eta: 0:20:03 lr: 0.000094 grad: 0.0935 (0.0949) loss: 0.8153 (0.8130) time: 0.2708 data: 0.0002 max mem: 26157 Train: [36] [2000/6250] eta: 0:19:34 lr: 0.000094 grad: 0.0906 (0.0949) loss: 0.8153 (0.8130) time: 0.2686 data: 0.0002 max mem: 26157 Train: [36] [2100/6250] eta: 0:19:05 lr: 0.000094 grad: 0.0942 (0.0949) loss: 0.8076 (0.8129) time: 0.2710 data: 0.0002 max mem: 26157 Train: [36] [2200/6250] eta: 0:18:36 lr: 0.000094 grad: 0.0878 (0.0949) loss: 0.8111 (0.8129) time: 0.2709 data: 0.0002 max mem: 26157 Train: [36] [2300/6250] eta: 0:18:15 lr: 0.000094 grad: 0.0904 (0.0948) loss: 0.8156 (0.8129) time: 0.4894 data: 0.2189 max mem: 26157 Train: [36] [2400/6250] eta: 0:17:46 lr: 0.000094 grad: 0.0968 (0.0948) loss: 0.8137 (0.8129) time: 0.2694 data: 0.0002 max mem: 26157 Train: [36] [2500/6250] eta: 0:17:17 lr: 0.000094 grad: 0.0912 (0.0947) loss: 0.8171 (0.8129) time: 0.2691 data: 0.0002 max mem: 26157 Train: [36] [2600/6250] eta: 0:17:03 lr: 0.000094 grad: 0.0912 (0.0947) loss: 0.8136 (0.8129) time: 0.3798 data: 0.1021 max mem: 26157 Train: [36] [2700/6250] eta: 0:16:40 lr: 0.000094 grad: 0.0881 (0.0947) loss: 0.8135 (0.8129) time: 0.2708 data: 0.0002 max mem: 26157 Train: [36] [2800/6250] eta: 0:16:10 lr: 0.000094 grad: 0.0933 (0.0947) loss: 0.8150 (0.8129) time: 0.2727 data: 0.0002 max mem: 26157 Train: [36] [2900/6250] eta: 0:15:41 lr: 0.000094 grad: 0.0987 (0.0949) loss: 0.8110 (0.8129) time: 0.2739 data: 0.0003 max mem: 26157 Train: [36] [3000/6250] eta: 0:15:12 lr: 0.000094 grad: 0.0932 (0.0949) loss: 0.8099 (0.8128) time: 0.2723 data: 0.0002 max mem: 26157 Train: [36] [3100/6250] eta: 0:14:44 lr: 0.000094 grad: 0.0980 (0.0949) loss: 0.8080 (0.8127) time: 0.2689 data: 0.0002 max mem: 26157 Train: [36] [3200/6250] eta: 0:14:15 lr: 0.000094 grad: 0.0926 (0.0949) loss: 0.8048 (0.8127) time: 0.2701 data: 0.0002 max mem: 26157 Train: [36] [3300/6250] eta: 0:13:46 lr: 0.000094 grad: 0.0934 (0.0949) loss: 0.8087 (0.8127) time: 0.2694 data: 0.0002 max mem: 26157 Train: [36] [3400/6250] eta: 0:13:17 lr: 0.000094 grad: 0.0913 (0.0950) loss: 0.8144 (0.8126) time: 0.2693 data: 0.0002 max mem: 26157 Train: [36] [3500/6250] eta: 0:12:48 lr: 0.000094 grad: 0.0922 (0.0951) loss: 0.8124 (0.8126) time: 0.2685 data: 0.0002 max mem: 26157 Train: [36] [3600/6250] eta: 0:12:19 lr: 0.000094 grad: 0.0934 (0.0951) loss: 0.8054 (0.8125) time: 0.2684 data: 0.0002 max mem: 26157 Train: [36] [3700/6250] eta: 0:11:51 lr: 0.000094 grad: 0.0920 (0.0952) loss: 0.8151 (0.8125) time: 0.2718 data: 0.0002 max mem: 26157 Train: [36] [3800/6250] eta: 0:11:22 lr: 0.000094 grad: 0.0990 (0.0953) loss: 0.8124 (0.8124) time: 0.2698 data: 0.0002 max mem: 26157 Train: [36] [3900/6250] eta: 0:10:54 lr: 0.000094 grad: 0.0955 (0.0953) loss: 0.8071 (0.8124) time: 0.2699 data: 0.0002 max mem: 26157 Train: [36] [4000/6250] eta: 0:10:26 lr: 0.000094 grad: 0.0914 (0.0953) loss: 0.8156 (0.8124) time: 0.2687 data: 0.0002 max mem: 26157 Train: [36] [4100/6250] eta: 0:09:57 lr: 0.000094 grad: 0.0941 (0.0954) loss: 0.8167 (0.8124) time: 0.2694 data: 0.0002 max mem: 26157 Train: [36] [4200/6250] eta: 0:09:29 lr: 0.000094 grad: 0.0931 (0.0954) loss: 0.8182 (0.8124) time: 0.2704 data: 0.0002 max mem: 26157 Train: [36] [4300/6250] eta: 0:09:01 lr: 0.000094 grad: 0.0926 (0.0955) loss: 0.8193 (0.8124) time: 0.2702 data: 0.0002 max mem: 26157 Train: [36] [4400/6250] eta: 0:08:33 lr: 0.000094 grad: 0.0931 (0.0955) loss: 0.8071 (0.8123) time: 0.2698 data: 0.0002 max mem: 26157 Train: [36] [4500/6250] eta: 0:08:05 lr: 0.000094 grad: 0.0939 (0.0955) loss: 0.8147 (0.8123) time: 0.2716 data: 0.0002 max mem: 26157 Train: [36] [4600/6250] eta: 0:07:37 lr: 0.000094 grad: 0.0917 (0.0955) loss: 0.8135 (0.8123) time: 0.2711 data: 0.0002 max mem: 26157 Train: [36] [4700/6250] eta: 0:07:09 lr: 0.000094 grad: 0.0980 (0.0956) loss: 0.8100 (0.8123) time: 0.2684 data: 0.0002 max mem: 26157 Train: [36] [4800/6250] eta: 0:06:41 lr: 0.000094 grad: 0.0890 (0.0956) loss: 0.8129 (0.8123) time: 0.2686 data: 0.0002 max mem: 26157 Train: [36] [4900/6250] eta: 0:06:13 lr: 0.000094 grad: 0.0988 (0.0958) loss: 0.8163 (0.8123) time: 0.2723 data: 0.0003 max mem: 26157 Train: [36] [5000/6250] eta: 0:05:45 lr: 0.000094 grad: 0.0898 (0.0958) loss: 0.8170 (0.8123) time: 0.2693 data: 0.0002 max mem: 26157 Train: [36] [5100/6250] eta: 0:05:18 lr: 0.000093 grad: 0.0999 (0.0958) loss: 0.8073 (0.8123) time: 0.2731 data: 0.0002 max mem: 26157 Train: [36] [5200/6250] eta: 0:04:50 lr: 0.000093 grad: 0.0894 (0.0959) loss: 0.8117 (0.8122) time: 0.2688 data: 0.0002 max mem: 26157 Train: [36] [5300/6250] eta: 0:04:22 lr: 0.000093 grad: 0.0956 (0.0958) loss: 0.8122 (0.8121) time: 0.2697 data: 0.0002 max mem: 26157 Train: [36] [5400/6250] eta: 0:03:54 lr: 0.000093 grad: 0.0899 (0.0959) loss: 0.8076 (0.8121) time: 0.2689 data: 0.0001 max mem: 26157 Train: [36] [5500/6250] eta: 0:03:27 lr: 0.000093 grad: 0.0963 (0.0959) loss: 0.8059 (0.8121) time: 0.2714 data: 0.0002 max mem: 26157 Train: [36] [5600/6250] eta: 0:02:59 lr: 0.000093 grad: 0.0915 (0.0959) loss: 0.8140 (0.8120) time: 0.2837 data: 0.0002 max mem: 26157 Train: [36] [5700/6250] eta: 0:02:33 lr: 0.000093 grad: 0.0900 (0.0959) loss: 0.8124 (0.8120) time: 0.8372 data: 0.5623 max mem: 26157 Train: [36] [5800/6250] eta: 0:02:05 lr: 0.000093 grad: 0.0935 (0.0959) loss: 0.8152 (0.8120) time: 0.2689 data: 0.0002 max mem: 26157 Train: [36] [5900/6250] eta: 0:01:37 lr: 0.000093 grad: 0.0930 (0.0959) loss: 0.8100 (0.8120) time: 0.2682 data: 0.0002 max mem: 26157 Train: [36] [6000/6250] eta: 0:01:09 lr: 0.000093 grad: 0.0946 (0.0959) loss: 0.8168 (0.8120) time: 0.2687 data: 0.0001 max mem: 26157 Train: [36] [6100/6250] eta: 0:00:41 lr: 0.000093 grad: 0.0940 (0.0959) loss: 0.8150 (0.8120) time: 0.2712 data: 0.0002 max mem: 26157 Train: [36] [6200/6250] eta: 0:00:13 lr: 0.000093 grad: 0.0911 (0.0960) loss: 0.8121 (0.8120) time: 0.2709 data: 0.0002 max mem: 26157 Train: [36] [6249/6250] eta: 0:00:00 lr: 0.000093 grad: 0.0972 (0.0960) loss: 0.8114 (0.8120) time: 0.2708 data: 0.0002 max mem: 26157 Train: [36] Total time: 0:29:05 (0.2792 s / it) Averaged stats: lr: 0.000093 grad: 0.0972 (0.0960) loss: 0.8114 (0.8120) Eval (hcp-train-subset): [36] [ 0/62] eta: 0:03:56 loss: 0.8359 (0.8359) time: 3.8095 data: 3.6922 max mem: 26157 Eval (hcp-train-subset): [36] [61/62] eta: 0:00:00 loss: 0.8276 (0.8291) time: 0.1415 data: 0.0568 max mem: 26157 Eval (hcp-train-subset): [36] Total time: 0:00:13 (0.2141 s / it) Averaged stats (hcp-train-subset): loss: 0.8276 (0.8291) Making plots (hcp-train-subset): example=57 Eval (hcp-val): [36] [ 0/62] eta: 0:03:27 loss: 0.8247 (0.8247) time: 3.3464 data: 3.2523 max mem: 26157 Eval (hcp-val): [36] [61/62] eta: 0:00:00 loss: 0.8268 (0.8273) time: 0.1446 data: 0.0610 max mem: 26157 Eval (hcp-val): [36] Total time: 0:00:12 (0.2086 s / it) Averaged stats (hcp-val): loss: 0.8268 (0.8273) Making plots (hcp-val): example=6 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [37] [ 0/6250] eta: 8:59:59 lr: 0.000093 grad: 0.1205 (0.1205) loss: 0.8328 (0.8328) time: 5.1838 data: 4.8421 max mem: 26157 Train: [37] [ 100/6250] eta: 0:33:20 lr: 0.000093 grad: 0.1005 (0.1123) loss: 0.8199 (0.8260) time: 0.2711 data: 0.0003 max mem: 26157 Train: [37] [ 200/6250] eta: 0:30:03 lr: 0.000093 grad: 0.0977 (0.1112) loss: 0.8105 (0.8201) time: 0.2715 data: 0.0002 max mem: 26157 Train: [37] [ 300/6250] eta: 0:28:40 lr: 0.000093 grad: 0.0857 (0.1050) loss: 0.8230 (0.8179) time: 0.2702 data: 0.0002 max mem: 26157 Train: [37] [ 400/6250] eta: 0:27:42 lr: 0.000093 grad: 0.0904 (0.1011) loss: 0.8205 (0.8176) time: 0.2693 data: 0.0002 max mem: 26157 Train: [37] [ 500/6250] eta: 0:26:57 lr: 0.000093 grad: 0.0844 (0.0992) loss: 0.8159 (0.8173) time: 0.2705 data: 0.0002 max mem: 26157 Train: [37] [ 600/6250] eta: 0:26:17 lr: 0.000093 grad: 0.0908 (0.0974) loss: 0.8103 (0.8170) time: 0.2694 data: 0.0002 max mem: 26157 Train: [37] [ 700/6250] eta: 0:25:43 lr: 0.000093 grad: 0.0842 (0.0963) loss: 0.8138 (0.8165) time: 0.2729 data: 0.0002 max mem: 26157 Train: [37] [ 800/6250] eta: 0:25:12 lr: 0.000093 grad: 0.0857 (0.0956) loss: 0.8152 (0.8161) time: 0.2721 data: 0.0002 max mem: 26157 Train: [37] [ 900/6250] eta: 0:24:40 lr: 0.000093 grad: 0.0847 (0.0949) loss: 0.8227 (0.8160) time: 0.2699 data: 0.0002 max mem: 26157 Train: [37] [1000/6250] eta: 0:24:09 lr: 0.000093 grad: 0.0910 (0.0944) loss: 0.8070 (0.8160) time: 0.2702 data: 0.0002 max mem: 26157 Train: [37] [1100/6250] eta: 0:23:39 lr: 0.000093 grad: 0.0860 (0.0941) loss: 0.8135 (0.8156) time: 0.2706 data: 0.0002 max mem: 26157 Train: [37] [1200/6250] eta: 0:23:09 lr: 0.000093 grad: 0.0863 (0.0939) loss: 0.8090 (0.8154) time: 0.2694 data: 0.0002 max mem: 26157 Train: [37] [1300/6250] eta: 0:22:39 lr: 0.000093 grad: 0.0915 (0.0940) loss: 0.8083 (0.8150) time: 0.2685 data: 0.0002 max mem: 26157 Train: [37] [1400/6250] eta: 0:22:10 lr: 0.000093 grad: 0.0949 (0.0942) loss: 0.8122 (0.8146) time: 0.2704 data: 0.0002 max mem: 26157 Train: [37] [1500/6250] eta: 0:21:41 lr: 0.000093 grad: 0.0933 (0.0944) loss: 0.8058 (0.8142) time: 0.2693 data: 0.0002 max mem: 26157 Train: [37] [1600/6250] eta: 0:21:12 lr: 0.000093 grad: 0.0974 (0.0947) loss: 0.8088 (0.8138) time: 0.2703 data: 0.0002 max mem: 26157 Train: [37] [1700/6250] eta: 0:20:44 lr: 0.000093 grad: 0.0877 (0.0947) loss: 0.8108 (0.8135) time: 0.2708 data: 0.0002 max mem: 26157 Train: [37] [1800/6250] eta: 0:20:16 lr: 0.000093 grad: 0.0936 (0.0952) loss: 0.8007 (0.8132) time: 0.2697 data: 0.0002 max mem: 26157 Train: [37] [1900/6250] eta: 0:19:48 lr: 0.000093 grad: 0.0939 (0.0953) loss: 0.8076 (0.8130) time: 0.2721 data: 0.0002 max mem: 26157 Train: [37] [2000/6250] eta: 0:19:20 lr: 0.000093 grad: 0.0910 (0.0953) loss: 0.8052 (0.8128) time: 0.2740 data: 0.0002 max mem: 26157 Train: [37] [2100/6250] eta: 0:19:13 lr: 0.000093 grad: 0.0946 (0.0953) loss: 0.8144 (0.8127) time: 0.3424 data: 0.0689 max mem: 26157 Train: [37] [2200/6250] eta: 0:18:47 lr: 0.000093 grad: 0.1008 (0.0955) loss: 0.8073 (0.8124) time: 0.2693 data: 0.0002 max mem: 26157 Train: [37] [2300/6250] eta: 0:18:30 lr: 0.000092 grad: 0.0922 (0.0955) loss: 0.8115 (0.8121) time: 0.2735 data: 0.0002 max mem: 26157 Train: [37] [2400/6250] eta: 0:18:00 lr: 0.000092 grad: 0.0910 (0.0958) loss: 0.7988 (0.8119) time: 0.2689 data: 0.0002 max mem: 26157 Train: [37] [2500/6250] eta: 0:17:30 lr: 0.000092 grad: 0.0971 (0.0960) loss: 0.8062 (0.8116) time: 0.2690 data: 0.0002 max mem: 26157 Train: [37] [2600/6250] eta: 0:17:01 lr: 0.000092 grad: 0.0970 (0.0964) loss: 0.8048 (0.8114) time: 0.2680 data: 0.0002 max mem: 26157 Train: [37] [2700/6250] eta: 0:16:31 lr: 0.000092 grad: 0.0966 (0.0966) loss: 0.8051 (0.8110) time: 0.2693 data: 0.0002 max mem: 26157 Train: [37] [2800/6250] eta: 0:16:02 lr: 0.000092 grad: 0.1069 (0.0969) loss: 0.8012 (0.8107) time: 0.2686 data: 0.0002 max mem: 26157 Train: [37] [2900/6250] eta: 0:15:33 lr: 0.000092 grad: 0.1005 (0.0971) loss: 0.8037 (0.8104) time: 0.2703 data: 0.0002 max mem: 26157 Train: [37] [3000/6250] eta: 0:15:04 lr: 0.000092 grad: 0.1004 (0.0972) loss: 0.8010 (0.8102) time: 0.2690 data: 0.0002 max mem: 26157 Train: [37] [3100/6250] eta: 0:14:35 lr: 0.000092 grad: 0.0969 (0.0973) loss: 0.8069 (0.8100) time: 0.2683 data: 0.0002 max mem: 26157 Train: [37] [3200/6250] eta: 0:14:07 lr: 0.000092 grad: 0.0959 (0.0975) loss: 0.7987 (0.8098) time: 0.2699 data: 0.0002 max mem: 26157 Train: [37] [3300/6250] eta: 0:13:38 lr: 0.000092 grad: 0.0964 (0.0975) loss: 0.8038 (0.8096) time: 0.2699 data: 0.0002 max mem: 26157 Train: [37] [3400/6250] eta: 0:13:10 lr: 0.000092 grad: 0.0945 (0.0976) loss: 0.8074 (0.8094) time: 0.2688 data: 0.0002 max mem: 26157 Train: [37] [3500/6250] eta: 0:12:43 lr: 0.000092 grad: 0.0934 (0.0976) loss: 0.8087 (0.8094) time: 0.2714 data: 0.0002 max mem: 26157 Train: [37] [3600/6250] eta: 0:12:17 lr: 0.000092 grad: 0.0995 (0.0977) loss: 0.8034 (0.8092) time: 0.4632 data: 0.1918 max mem: 26157 Train: [37] [3700/6250] eta: 0:11:49 lr: 0.000092 grad: 0.0970 (0.0979) loss: 0.8048 (0.8090) time: 0.2699 data: 0.0002 max mem: 26157 Train: [37] [3800/6250] eta: 0:11:20 lr: 0.000092 grad: 0.1055 (0.0980) loss: 0.8069 (0.8089) time: 0.2709 data: 0.0002 max mem: 26157 Train: [37] [3900/6250] eta: 0:10:57 lr: 0.000092 grad: 0.1040 (0.0981) loss: 0.8070 (0.8088) time: 0.2718 data: 0.0002 max mem: 26157 Train: [37] [4000/6250] eta: 0:10:28 lr: 0.000092 grad: 0.0973 (0.0983) loss: 0.8020 (0.8087) time: 0.2691 data: 0.0002 max mem: 26157 Train: [37] [4100/6250] eta: 0:10:02 lr: 0.000092 grad: 0.0961 (0.0984) loss: 0.8079 (0.8085) time: 0.4771 data: 0.2030 max mem: 26157 Train: [37] [4200/6250] eta: 0:09:33 lr: 0.000092 grad: 0.0985 (0.0985) loss: 0.8097 (0.8084) time: 0.2687 data: 0.0002 max mem: 26157 Train: [37] [4300/6250] eta: 0:09:05 lr: 0.000092 grad: 0.0979 (0.0984) loss: 0.8093 (0.8084) time: 0.2962 data: 0.0117 max mem: 26157 Train: [37] [4400/6250] eta: 0:08:38 lr: 0.000092 grad: 0.0982 (0.0984) loss: 0.8099 (0.8084) time: 0.2699 data: 0.0002 max mem: 26157 Train: [37] [4500/6250] eta: 0:08:10 lr: 0.000092 grad: 0.0893 (0.0983) loss: 0.8037 (0.8084) time: 0.2689 data: 0.0002 max mem: 26157 Train: [37] [4600/6250] eta: 0:07:41 lr: 0.000092 grad: 0.0925 (0.0983) loss: 0.8055 (0.8084) time: 0.2689 data: 0.0002 max mem: 26157 Train: [37] [4700/6250] eta: 0:07:13 lr: 0.000092 grad: 0.0963 (0.0983) loss: 0.8084 (0.8083) time: 0.2690 data: 0.0002 max mem: 26157 Train: [37] [4800/6250] eta: 0:06:45 lr: 0.000092 grad: 0.0921 (0.0982) loss: 0.8122 (0.8084) time: 0.2687 data: 0.0002 max mem: 26157 Train: [37] [4900/6250] eta: 0:06:17 lr: 0.000092 grad: 0.0936 (0.0982) loss: 0.8170 (0.8085) time: 0.2692 data: 0.0002 max mem: 26157 Train: [37] [5000/6250] eta: 0:05:48 lr: 0.000092 grad: 0.0946 (0.0981) loss: 0.8157 (0.8085) time: 0.2696 data: 0.0002 max mem: 26157 Train: [37] [5100/6250] eta: 0:05:20 lr: 0.000092 grad: 0.0912 (0.0980) loss: 0.8136 (0.8086) time: 0.2699 data: 0.0002 max mem: 26157 Train: [37] [5200/6250] eta: 0:04:52 lr: 0.000092 grad: 0.0915 (0.0980) loss: 0.8186 (0.8086) time: 0.2730 data: 0.0002 max mem: 26157 Train: [37] [5300/6250] eta: 0:04:24 lr: 0.000092 grad: 0.0949 (0.0980) loss: 0.8143 (0.8086) time: 0.2685 data: 0.0002 max mem: 26157 Train: [37] [5400/6250] eta: 0:03:56 lr: 0.000092 grad: 0.0924 (0.0979) loss: 0.8077 (0.8087) time: 0.2701 data: 0.0002 max mem: 26157 Train: [37] [5500/6250] eta: 0:03:28 lr: 0.000092 grad: 0.0934 (0.0979) loss: 0.8125 (0.8087) time: 0.2689 data: 0.0002 max mem: 26157 Train: [37] [5600/6250] eta: 0:03:00 lr: 0.000092 grad: 0.0932 (0.0979) loss: 0.8149 (0.8087) time: 0.2691 data: 0.0001 max mem: 26157 Train: [37] [5700/6250] eta: 0:02:32 lr: 0.000091 grad: 0.0960 (0.0979) loss: 0.8149 (0.8088) time: 0.2678 data: 0.0002 max mem: 26157 Train: [37] [5800/6250] eta: 0:02:04 lr: 0.000091 grad: 0.1034 (0.0980) loss: 0.8131 (0.8088) time: 0.2685 data: 0.0002 max mem: 26157 Train: [37] [5900/6250] eta: 0:01:37 lr: 0.000091 grad: 0.0992 (0.0981) loss: 0.8051 (0.8088) time: 0.2715 data: 0.0002 max mem: 26157 Train: [37] [6000/6250] eta: 0:01:09 lr: 0.000091 grad: 0.0992 (0.0982) loss: 0.8110 (0.8087) time: 0.2708 data: 0.0002 max mem: 26157 Train: [37] [6100/6250] eta: 0:00:41 lr: 0.000091 grad: 0.0977 (0.0982) loss: 0.8079 (0.8087) time: 0.2690 data: 0.0002 max mem: 26157 Train: [37] [6200/6250] eta: 0:00:13 lr: 0.000091 grad: 0.0940 (0.0982) loss: 0.8163 (0.8087) time: 0.2687 data: 0.0002 max mem: 26157 Train: [37] [6249/6250] eta: 0:00:00 lr: 0.000091 grad: 0.0978 (0.0983) loss: 0.8036 (0.8087) time: 0.2692 data: 0.0002 max mem: 26157 Train: [37] Total time: 0:28:57 (0.2781 s / it) Averaged stats: lr: 0.000091 grad: 0.0978 (0.0983) loss: 0.8036 (0.8087) Eval (hcp-train-subset): [37] [ 0/62] eta: 0:04:41 loss: 0.8373 (0.8373) time: 4.5386 data: 4.4546 max mem: 26157 Eval (hcp-train-subset): [37] [61/62] eta: 0:00:00 loss: 0.8289 (0.8285) time: 0.1185 data: 0.0336 max mem: 26157 Eval (hcp-train-subset): [37] Total time: 0:00:12 (0.2050 s / it) Averaged stats (hcp-train-subset): loss: 0.8289 (0.8285) Making plots (hcp-train-subset): example=59 Eval (hcp-val): [37] [ 0/62] eta: 0:03:45 loss: 0.8237 (0.8237) time: 3.6301 data: 3.4898 max mem: 26157 Eval (hcp-val): [37] [61/62] eta: 0:00:00 loss: 0.8243 (0.8273) time: 0.1096 data: 0.0270 max mem: 26157 Eval (hcp-val): [37] Total time: 0:00:12 (0.2057 s / it) Averaged stats (hcp-val): loss: 0.8243 (0.8273) Making plots (hcp-val): example=29 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [38] [ 0/6250] eta: 7:06:26 lr: 0.000091 grad: 0.0593 (0.0593) loss: 0.8689 (0.8689) time: 4.0938 data: 3.7435 max mem: 26157 Train: [38] [ 100/6250] eta: 0:33:19 lr: 0.000091 grad: 0.1059 (0.1139) loss: 0.8010 (0.8142) time: 0.2690 data: 0.0001 max mem: 26157 Train: [38] [ 200/6250] eta: 0:30:05 lr: 0.000091 grad: 0.1029 (0.1070) loss: 0.8146 (0.8127) time: 0.2717 data: 0.0002 max mem: 26157 Train: [38] [ 300/6250] eta: 0:29:29 lr: 0.000091 grad: 0.0988 (0.1063) loss: 0.8120 (0.8115) time: 0.3695 data: 0.0814 max mem: 26157 Train: [38] [ 400/6250] eta: 0:28:34 lr: 0.000091 grad: 0.0983 (0.1049) loss: 0.8107 (0.8117) time: 0.3139 data: 0.0400 max mem: 26157 Train: [38] [ 500/6250] eta: 0:28:06 lr: 0.000091 grad: 0.0921 (0.1036) loss: 0.8163 (0.8126) time: 0.3818 data: 0.0986 max mem: 26157 Train: [38] [ 600/6250] eta: 0:27:15 lr: 0.000091 grad: 0.0909 (0.1026) loss: 0.8221 (0.8129) time: 0.2697 data: 0.0002 max mem: 26157 Train: [38] [ 700/6250] eta: 0:26:52 lr: 0.000091 grad: 0.0884 (0.1010) loss: 0.8106 (0.8129) time: 0.3929 data: 0.1183 max mem: 26157 Train: [38] [ 800/6250] eta: 0:26:28 lr: 0.000091 grad: 0.0877 (0.1004) loss: 0.8172 (0.8128) time: 0.3946 data: 0.1068 max mem: 26157 Train: [38] [ 900/6250] eta: 0:26:00 lr: 0.000091 grad: 0.0910 (0.0997) loss: 0.8127 (0.8127) time: 0.2734 data: 0.0003 max mem: 26157 Train: [38] [1000/6250] eta: 0:25:21 lr: 0.000091 grad: 0.0910 (0.0992) loss: 0.8149 (0.8127) time: 0.2733 data: 0.0002 max mem: 26157 Train: [38] [1100/6250] eta: 0:25:07 lr: 0.000091 grad: 0.0974 (0.0986) loss: 0.8001 (0.8124) time: 0.2724 data: 0.0002 max mem: 26157 Train: [38] [1200/6250] eta: 0:24:29 lr: 0.000091 grad: 0.0968 (0.0983) loss: 0.8092 (0.8124) time: 0.2700 data: 0.0002 max mem: 26157 Train: [38] [1300/6250] eta: 0:24:06 lr: 0.000091 grad: 0.0992 (0.0982) loss: 0.8104 (0.8123) time: 0.2697 data: 0.0001 max mem: 26157 Train: [38] [1400/6250] eta: 0:23:30 lr: 0.000091 grad: 0.0931 (0.0978) loss: 0.8148 (0.8123) time: 0.2705 data: 0.0002 max mem: 26157 Train: [38] [1500/6250] eta: 0:22:54 lr: 0.000091 grad: 0.0932 (0.0978) loss: 0.8109 (0.8122) time: 0.2693 data: 0.0002 max mem: 26157 Train: [38] [1600/6250] eta: 0:22:19 lr: 0.000091 grad: 0.0954 (0.0978) loss: 0.8121 (0.8120) time: 0.2703 data: 0.0002 max mem: 26157 Train: [38] [1700/6250] eta: 0:21:46 lr: 0.000091 grad: 0.0950 (0.0977) loss: 0.8043 (0.8118) time: 0.2703 data: 0.0002 max mem: 26157 Train: [38] [1800/6250] eta: 0:21:13 lr: 0.000091 grad: 0.0955 (0.0977) loss: 0.8095 (0.8117) time: 0.2690 data: 0.0002 max mem: 26157 Train: [38] [1900/6250] eta: 0:20:41 lr: 0.000091 grad: 0.0974 (0.0977) loss: 0.8061 (0.8116) time: 0.2695 data: 0.0002 max mem: 26157 Train: [38] [2000/6250] eta: 0:20:09 lr: 0.000091 grad: 0.1018 (0.0978) loss: 0.8073 (0.8114) time: 0.2781 data: 0.0002 max mem: 26157 Train: [38] [2100/6250] eta: 0:19:38 lr: 0.000091 grad: 0.0967 (0.0980) loss: 0.8065 (0.8113) time: 0.2707 data: 0.0003 max mem: 26157 Train: [38] [2200/6250] eta: 0:19:07 lr: 0.000091 grad: 0.0955 (0.0980) loss: 0.8093 (0.8113) time: 0.2690 data: 0.0002 max mem: 26157 Train: [38] [2300/6250] eta: 0:18:41 lr: 0.000091 grad: 0.0981 (0.0981) loss: 0.8057 (0.8111) time: 0.2701 data: 0.0002 max mem: 26157 Train: [38] [2400/6250] eta: 0:18:10 lr: 0.000091 grad: 0.0962 (0.0982) loss: 0.8101 (0.8110) time: 0.2677 data: 0.0002 max mem: 26157 Train: [38] [2500/6250] eta: 0:17:39 lr: 0.000091 grad: 0.0992 (0.0983) loss: 0.8043 (0.8109) time: 0.2685 data: 0.0002 max mem: 26157 Train: [38] [2600/6250] eta: 0:17:09 lr: 0.000091 grad: 0.0977 (0.0983) loss: 0.8083 (0.8109) time: 0.2680 data: 0.0002 max mem: 26157 Train: [38] [2700/6250] eta: 0:16:40 lr: 0.000091 grad: 0.0947 (0.0984) loss: 0.8056 (0.8107) time: 0.2681 data: 0.0002 max mem: 26157 Train: [38] [2800/6250] eta: 0:16:11 lr: 0.000091 grad: 0.0924 (0.0984) loss: 0.8030 (0.8106) time: 0.2686 data: 0.0001 max mem: 26157 Train: [38] [2900/6250] eta: 0:15:42 lr: 0.000090 grad: 0.1003 (0.0984) loss: 0.8024 (0.8104) time: 0.2755 data: 0.0002 max mem: 26157 Train: [38] [3000/6250] eta: 0:15:13 lr: 0.000090 grad: 0.0980 (0.0985) loss: 0.8065 (0.8103) time: 0.2693 data: 0.0001 max mem: 26157 Train: [38] [3100/6250] eta: 0:14:44 lr: 0.000090 grad: 0.0948 (0.0987) loss: 0.8052 (0.8102) time: 0.2686 data: 0.0002 max mem: 26157 Train: [38] [3200/6250] eta: 0:14:14 lr: 0.000090 grad: 0.0989 (0.0987) loss: 0.8043 (0.8100) time: 0.2687 data: 0.0002 max mem: 26157 Train: [38] [3300/6250] eta: 0:13:45 lr: 0.000090 grad: 0.1060 (0.0988) loss: 0.7974 (0.8098) time: 0.2681 data: 0.0001 max mem: 26157 Train: [38] [3400/6250] eta: 0:13:16 lr: 0.000090 grad: 0.0957 (0.0988) loss: 0.8033 (0.8098) time: 0.2678 data: 0.0002 max mem: 26157 Train: [38] [3500/6250] eta: 0:12:48 lr: 0.000090 grad: 0.1002 (0.0990) loss: 0.8043 (0.8096) time: 0.2693 data: 0.0002 max mem: 26157 Train: [38] [3600/6250] eta: 0:12:19 lr: 0.000090 grad: 0.1017 (0.0991) loss: 0.7972 (0.8095) time: 0.2695 data: 0.0002 max mem: 26157 Train: [38] [3700/6250] eta: 0:11:50 lr: 0.000090 grad: 0.1010 (0.0991) loss: 0.8028 (0.8094) time: 0.2687 data: 0.0002 max mem: 26157 Train: [38] [3800/6250] eta: 0:11:22 lr: 0.000090 grad: 0.1022 (0.0992) loss: 0.8008 (0.8092) time: 0.2703 data: 0.0002 max mem: 26157 Train: [38] [3900/6250] eta: 0:10:54 lr: 0.000090 grad: 0.0950 (0.0992) loss: 0.7997 (0.8091) time: 0.2696 data: 0.0002 max mem: 26157 Train: [38] [4000/6250] eta: 0:10:25 lr: 0.000090 grad: 0.1020 (0.0993) loss: 0.8005 (0.8090) time: 0.2698 data: 0.0002 max mem: 26157 Train: [38] [4100/6250] eta: 0:09:57 lr: 0.000090 grad: 0.0974 (0.0993) loss: 0.8054 (0.8088) time: 0.2683 data: 0.0001 max mem: 26157 Train: [38] [4200/6250] eta: 0:09:29 lr: 0.000090 grad: 0.0938 (0.0993) loss: 0.8079 (0.8087) time: 0.2688 data: 0.0002 max mem: 26157 Train: [38] [4300/6250] eta: 0:09:01 lr: 0.000090 grad: 0.0962 (0.0993) loss: 0.8054 (0.8086) time: 0.2680 data: 0.0002 max mem: 26157 Train: [38] [4400/6250] eta: 0:08:33 lr: 0.000090 grad: 0.0964 (0.0994) loss: 0.8054 (0.8086) time: 0.2713 data: 0.0002 max mem: 26157 Train: [38] [4500/6250] eta: 0:08:05 lr: 0.000090 grad: 0.1016 (0.0996) loss: 0.8038 (0.8085) time: 0.2687 data: 0.0002 max mem: 26157 Train: [38] [4600/6250] eta: 0:07:37 lr: 0.000090 grad: 0.0977 (0.0996) loss: 0.8082 (0.8085) time: 0.2684 data: 0.0002 max mem: 26157 Train: [38] [4700/6250] eta: 0:07:09 lr: 0.000090 grad: 0.0942 (0.0996) loss: 0.8120 (0.8085) time: 0.2687 data: 0.0002 max mem: 26157 Train: [38] [4800/6250] eta: 0:06:41 lr: 0.000090 grad: 0.0909 (0.0997) loss: 0.8135 (0.8085) time: 0.2679 data: 0.0002 max mem: 26157 Train: [38] [4900/6250] eta: 0:06:13 lr: 0.000090 grad: 0.1014 (0.0997) loss: 0.8150 (0.8085) time: 0.2685 data: 0.0002 max mem: 26157 Train: [38] [5000/6250] eta: 0:05:45 lr: 0.000090 grad: 0.0918 (0.0997) loss: 0.8100 (0.8085) time: 0.2681 data: 0.0002 max mem: 26157 Train: [38] [5100/6250] eta: 0:05:17 lr: 0.000090 grad: 0.1018 (0.0997) loss: 0.8058 (0.8085) time: 0.2698 data: 0.0002 max mem: 26157 Train: [38] [5200/6250] eta: 0:04:49 lr: 0.000090 grad: 0.0956 (0.0997) loss: 0.8107 (0.8085) time: 0.2693 data: 0.0002 max mem: 26157 Train: [38] [5300/6250] eta: 0:04:22 lr: 0.000090 grad: 0.0968 (0.0997) loss: 0.8097 (0.8085) time: 0.2723 data: 0.0002 max mem: 26157 Train: [38] [5400/6250] eta: 0:03:54 lr: 0.000090 grad: 0.0921 (0.0996) loss: 0.8095 (0.8085) time: 0.2728 data: 0.0002 max mem: 26157 Train: [38] [5500/6250] eta: 0:03:26 lr: 0.000090 grad: 0.0986 (0.0997) loss: 0.8036 (0.8085) time: 0.2700 data: 0.0002 max mem: 26157 Train: [38] [5600/6250] eta: 0:02:59 lr: 0.000090 grad: 0.1005 (0.0998) loss: 0.8086 (0.8084) time: 0.2701 data: 0.0002 max mem: 26157 Train: [38] [5700/6250] eta: 0:02:31 lr: 0.000090 grad: 0.1104 (0.0998) loss: 0.7977 (0.8083) time: 0.2686 data: 0.0002 max mem: 26157 Train: [38] [5800/6250] eta: 0:02:03 lr: 0.000090 grad: 0.0988 (0.0999) loss: 0.8044 (0.8083) time: 0.2690 data: 0.0002 max mem: 26157 Train: [38] [5900/6250] eta: 0:01:36 lr: 0.000090 grad: 0.0926 (0.1000) loss: 0.8100 (0.8081) time: 0.2683 data: 0.0002 max mem: 26157 Train: [38] [6000/6250] eta: 0:01:08 lr: 0.000090 grad: 0.0977 (0.1001) loss: 0.8054 (0.8080) time: 0.2690 data: 0.0002 max mem: 26157 Train: [38] [6100/6250] eta: 0:00:41 lr: 0.000090 grad: 0.1071 (0.1002) loss: 0.8011 (0.8080) time: 0.2691 data: 0.0002 max mem: 26157 Train: [38] [6200/6250] eta: 0:00:13 lr: 0.000089 grad: 0.1031 (0.1003) loss: 0.8002 (0.8078) time: 0.2701 data: 0.0002 max mem: 26157 Train: [38] [6249/6250] eta: 0:00:00 lr: 0.000089 grad: 0.0980 (0.1003) loss: 0.8066 (0.8078) time: 0.2694 data: 0.0002 max mem: 26157 Train: [38] Total time: 0:28:44 (0.2760 s / it) Averaged stats: lr: 0.000089 grad: 0.0980 (0.1003) loss: 0.8066 (0.8078) Eval (hcp-train-subset): [38] [ 0/62] eta: 0:03:19 loss: 0.8383 (0.8383) time: 3.2142 data: 3.0904 max mem: 26157 Eval (hcp-train-subset): [38] [61/62] eta: 0:00:00 loss: 0.8278 (0.8294) time: 0.1384 data: 0.0554 max mem: 26157 Eval (hcp-train-subset): [38] Total time: 0:00:13 (0.2167 s / it) Averaged stats (hcp-train-subset): loss: 0.8278 (0.8294) Making plots (hcp-train-subset): example=35 Eval (hcp-val): [38] [ 0/62] eta: 0:05:27 loss: 0.8241 (0.8241) time: 5.2855 data: 5.1949 max mem: 26157 Eval (hcp-val): [38] [61/62] eta: 0:00:00 loss: 0.8274 (0.8275) time: 0.1305 data: 0.0453 max mem: 26157 Eval (hcp-val): [38] Total time: 0:00:14 (0.2261 s / it) Averaged stats (hcp-val): loss: 0.8274 (0.8275) Making plots (hcp-val): example=62 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [39] [ 0/6250] eta: 8:08:53 lr: 0.000089 grad: 0.0687 (0.0687) loss: 0.8752 (0.8752) time: 4.6933 data: 4.3485 max mem: 26157 Train: [39] [ 100/6250] eta: 0:33:28 lr: 0.000089 grad: 0.0999 (0.1212) loss: 0.8222 (0.8229) time: 0.2691 data: 0.0002 max mem: 26157 Train: [39] [ 200/6250] eta: 0:30:08 lr: 0.000089 grad: 0.0962 (0.1112) loss: 0.8205 (0.8200) time: 0.2732 data: 0.0002 max mem: 26157 Train: [39] [ 300/6250] eta: 0:28:45 lr: 0.000089 grad: 0.1015 (0.1078) loss: 0.8103 (0.8196) time: 0.2735 data: 0.0002 max mem: 26157 Train: [39] [ 400/6250] eta: 0:27:48 lr: 0.000089 grad: 0.0949 (0.1059) loss: 0.8151 (0.8183) time: 0.2697 data: 0.0002 max mem: 26157 Train: [39] [ 500/6250] eta: 0:27:02 lr: 0.000089 grad: 0.0971 (0.1046) loss: 0.8157 (0.8168) time: 0.2706 data: 0.0002 max mem: 26157 Train: [39] [ 600/6250] eta: 0:26:21 lr: 0.000089 grad: 0.0937 (0.1035) loss: 0.8137 (0.8161) time: 0.2690 data: 0.0002 max mem: 26157 Train: [39] [ 700/6250] eta: 0:25:46 lr: 0.000089 grad: 0.0980 (0.1025) loss: 0.8126 (0.8158) time: 0.2691 data: 0.0002 max mem: 26157 Train: [39] [ 800/6250] eta: 0:25:12 lr: 0.000089 grad: 0.0950 (0.1024) loss: 0.8083 (0.8149) time: 0.2714 data: 0.0002 max mem: 26157 Train: [39] [ 900/6250] eta: 0:24:40 lr: 0.000089 grad: 0.0941 (0.1025) loss: 0.8170 (0.8146) time: 0.2691 data: 0.0002 max mem: 26157 Train: [39] [1000/6250] eta: 0:24:09 lr: 0.000089 grad: 0.0925 (0.1018) loss: 0.8097 (0.8143) time: 0.2712 data: 0.0002 max mem: 26157 Train: [39] [1100/6250] eta: 0:23:39 lr: 0.000089 grad: 0.0933 (0.1012) loss: 0.8160 (0.8140) time: 0.2706 data: 0.0002 max mem: 26157 Train: [39] [1200/6250] eta: 0:23:09 lr: 0.000089 grad: 0.0872 (0.1009) loss: 0.8139 (0.8135) time: 0.2710 data: 0.0002 max mem: 26157 Train: [39] [1300/6250] eta: 0:22:40 lr: 0.000089 grad: 0.1045 (0.1009) loss: 0.8062 (0.8130) time: 0.2698 data: 0.0002 max mem: 26157 Train: [39] [1400/6250] eta: 0:22:11 lr: 0.000089 grad: 0.0966 (0.1011) loss: 0.8067 (0.8125) time: 0.2712 data: 0.0002 max mem: 26157 Train: [39] [1500/6250] eta: 0:21:45 lr: 0.000089 grad: 0.0925 (0.1012) loss: 0.8058 (0.8123) time: 0.2694 data: 0.0002 max mem: 26157 Train: [39] [1600/6250] eta: 0:21:16 lr: 0.000089 grad: 0.0968 (0.1012) loss: 0.8049 (0.8117) time: 0.2695 data: 0.0002 max mem: 26157 Train: [39] [1700/6250] eta: 0:21:02 lr: 0.000089 grad: 0.0932 (0.1013) loss: 0.8054 (0.8112) time: 0.2708 data: 0.0002 max mem: 26157 Train: [39] [1800/6250] eta: 0:20:32 lr: 0.000089 grad: 0.0992 (0.1014) loss: 0.8011 (0.8107) time: 0.2692 data: 0.0002 max mem: 26157 Train: [39] [1900/6250] eta: 0:20:09 lr: 0.000089 grad: 0.1078 (0.1015) loss: 0.8046 (0.8104) time: 0.2697 data: 0.0002 max mem: 26157 Train: [39] [2000/6250] eta: 0:19:39 lr: 0.000089 grad: 0.1019 (0.1017) loss: 0.8029 (0.8100) time: 0.2699 data: 0.0002 max mem: 26157 Train: [39] [2100/6250] eta: 0:19:10 lr: 0.000089 grad: 0.1044 (0.1020) loss: 0.8034 (0.8095) time: 0.2725 data: 0.0002 max mem: 26157 Train: [39] [2200/6250] eta: 0:18:41 lr: 0.000089 grad: 0.0962 (0.1021) loss: 0.8006 (0.8090) time: 0.2695 data: 0.0002 max mem: 26157 Train: [39] [2300/6250] eta: 0:18:12 lr: 0.000089 grad: 0.1042 (0.1022) loss: 0.8000 (0.8086) time: 0.2691 data: 0.0002 max mem: 26157 Train: [39] [2400/6250] eta: 0:17:43 lr: 0.000089 grad: 0.1041 (0.1024) loss: 0.8059 (0.8083) time: 0.2710 data: 0.0002 max mem: 26157 Train: [39] [2500/6250] eta: 0:17:20 lr: 0.000089 grad: 0.0970 (0.1025) loss: 0.8048 (0.8081) time: 0.2692 data: 0.0002 max mem: 26157 Train: [39] [2600/6250] eta: 0:16:51 lr: 0.000089 grad: 0.1115 (0.1025) loss: 0.8003 (0.8078) time: 0.2688 data: 0.0002 max mem: 26157 Train: [39] [2700/6250] eta: 0:16:22 lr: 0.000089 grad: 0.1010 (0.1027) loss: 0.7994 (0.8076) time: 0.2701 data: 0.0002 max mem: 26157 Train: [39] [2800/6250] eta: 0:15:54 lr: 0.000089 grad: 0.1046 (0.1028) loss: 0.8080 (0.8074) time: 0.2693 data: 0.0002 max mem: 26157 Train: [39] [2900/6250] eta: 0:15:25 lr: 0.000089 grad: 0.0955 (0.1028) loss: 0.8110 (0.8073) time: 0.2690 data: 0.0001 max mem: 26157 Train: [39] [3000/6250] eta: 0:14:57 lr: 0.000089 grad: 0.1055 (0.1028) loss: 0.7971 (0.8072) time: 0.2686 data: 0.0002 max mem: 26157 Train: [39] [3100/6250] eta: 0:14:29 lr: 0.000089 grad: 0.1031 (0.1031) loss: 0.8032 (0.8072) time: 0.2691 data: 0.0002 max mem: 26157 Train: [39] [3200/6250] eta: 0:14:01 lr: 0.000089 grad: 0.1000 (0.1032) loss: 0.8026 (0.8071) time: 0.2712 data: 0.0002 max mem: 26157 Train: [39] [3300/6250] eta: 0:13:32 lr: 0.000088 grad: 0.1050 (0.1033) loss: 0.8048 (0.8070) time: 0.2673 data: 0.0001 max mem: 26157 Train: [39] [3400/6250] eta: 0:13:04 lr: 0.000088 grad: 0.1005 (0.1034) loss: 0.8068 (0.8070) time: 0.2699 data: 0.0002 max mem: 26157 Train: [39] [3500/6250] eta: 0:12:36 lr: 0.000088 grad: 0.1045 (0.1035) loss: 0.8017 (0.8069) time: 0.2744 data: 0.0003 max mem: 26157 Train: [39] [3600/6250] eta: 0:12:09 lr: 0.000088 grad: 0.0935 (0.1034) loss: 0.8040 (0.8070) time: 0.2701 data: 0.0002 max mem: 26157 Train: [39] [3700/6250] eta: 0:11:41 lr: 0.000088 grad: 0.0929 (0.1033) loss: 0.8056 (0.8070) time: 0.2691 data: 0.0002 max mem: 26157 Train: [39] [3800/6250] eta: 0:11:13 lr: 0.000088 grad: 0.1034 (0.1034) loss: 0.8033 (0.8069) time: 0.2685 data: 0.0002 max mem: 26157 Train: [39] [3900/6250] eta: 0:10:45 lr: 0.000088 grad: 0.1014 (0.1034) loss: 0.8080 (0.8070) time: 0.2692 data: 0.0002 max mem: 26157 Train: [39] [4000/6250] eta: 0:10:20 lr: 0.000088 grad: 0.1016 (0.1035) loss: 0.8044 (0.8070) time: 0.2695 data: 0.0002 max mem: 26157 Train: [39] [4100/6250] eta: 0:09:52 lr: 0.000088 grad: 0.1026 (0.1035) loss: 0.8096 (0.8070) time: 0.2704 data: 0.0002 max mem: 26157 Train: [39] [4200/6250] eta: 0:09:28 lr: 0.000088 grad: 0.0986 (0.1035) loss: 0.8070 (0.8070) time: 0.6547 data: 0.3766 max mem: 26157 Train: [39] [4300/6250] eta: 0:09:00 lr: 0.000088 grad: 0.0999 (0.1035) loss: 0.8055 (0.8070) time: 0.2681 data: 0.0002 max mem: 26157 Train: [39] [4400/6250] eta: 0:08:32 lr: 0.000088 grad: 0.1031 (0.1035) loss: 0.8090 (0.8070) time: 0.2694 data: 0.0002 max mem: 26157 Train: [39] [4500/6250] eta: 0:08:04 lr: 0.000088 grad: 0.0998 (0.1035) loss: 0.8066 (0.8071) time: 0.2687 data: 0.0002 max mem: 26157 Train: [39] [4600/6250] eta: 0:07:36 lr: 0.000088 grad: 0.1026 (0.1035) loss: 0.8038 (0.8071) time: 0.2691 data: 0.0002 max mem: 26157 Train: [39] [4700/6250] eta: 0:07:08 lr: 0.000088 grad: 0.0991 (0.1036) loss: 0.8038 (0.8071) time: 0.2714 data: 0.0002 max mem: 26157 Train: [39] [4800/6250] eta: 0:06:40 lr: 0.000088 grad: 0.1033 (0.1036) loss: 0.8135 (0.8072) time: 0.2687 data: 0.0002 max mem: 26157 Train: [39] [4900/6250] eta: 0:06:12 lr: 0.000088 grad: 0.0989 (0.1037) loss: 0.8162 (0.8072) time: 0.2693 data: 0.0002 max mem: 26157 Train: [39] [5000/6250] eta: 0:05:45 lr: 0.000088 grad: 0.0958 (0.1036) loss: 0.8141 (0.8073) time: 0.2684 data: 0.0002 max mem: 26157 Train: [39] [5100/6250] eta: 0:05:17 lr: 0.000088 grad: 0.1045 (0.1036) loss: 0.8086 (0.8074) time: 0.3019 data: 0.0302 max mem: 26157 Train: [39] [5200/6250] eta: 0:04:49 lr: 0.000088 grad: 0.1006 (0.1035) loss: 0.8156 (0.8075) time: 0.2722 data: 0.0003 max mem: 26157 Train: [39] [5300/6250] eta: 0:04:23 lr: 0.000088 grad: 0.0960 (0.1035) loss: 0.8203 (0.8076) time: 0.2715 data: 0.0002 max mem: 26157 Train: [39] [5400/6250] eta: 0:03:55 lr: 0.000088 grad: 0.0974 (0.1034) loss: 0.8081 (0.8076) time: 0.2690 data: 0.0001 max mem: 26157 Train: [39] [5500/6250] eta: 0:03:27 lr: 0.000088 grad: 0.0993 (0.1035) loss: 0.8101 (0.8077) time: 0.2684 data: 0.0001 max mem: 26157 Train: [39] [5600/6250] eta: 0:02:59 lr: 0.000088 grad: 0.1047 (0.1035) loss: 0.8123 (0.8078) time: 0.2688 data: 0.0002 max mem: 26157 Train: [39] [5700/6250] eta: 0:02:31 lr: 0.000088 grad: 0.1012 (0.1035) loss: 0.8118 (0.8078) time: 0.2710 data: 0.0002 max mem: 26157 Train: [39] [5800/6250] eta: 0:02:04 lr: 0.000088 grad: 0.0941 (0.1034) loss: 0.8077 (0.8078) time: 0.2684 data: 0.0001 max mem: 26157 Train: [39] [5900/6250] eta: 0:01:36 lr: 0.000088 grad: 0.0970 (0.1034) loss: 0.8064 (0.8079) time: 0.2693 data: 0.0002 max mem: 26157 Train: [39] [6000/6250] eta: 0:01:08 lr: 0.000088 grad: 0.0946 (0.1034) loss: 0.8135 (0.8079) time: 0.2695 data: 0.0002 max mem: 26157 Train: [39] [6100/6250] eta: 0:00:41 lr: 0.000088 grad: 0.0967 (0.1034) loss: 0.8073 (0.8079) time: 0.2690 data: 0.0002 max mem: 26157 Train: [39] [6200/6250] eta: 0:00:13 lr: 0.000088 grad: 0.0964 (0.1034) loss: 0.8145 (0.8080) time: 0.2710 data: 0.0002 max mem: 26157 Train: [39] [6249/6250] eta: 0:00:00 lr: 0.000088 grad: 0.0982 (0.1034) loss: 0.8093 (0.8080) time: 0.2685 data: 0.0002 max mem: 26157 Train: [39] Total time: 0:28:49 (0.2766 s / it) Averaged stats: lr: 0.000088 grad: 0.0982 (0.1034) loss: 0.8093 (0.8080) Eval (hcp-train-subset): [39] [ 0/62] eta: 0:04:06 loss: 0.8406 (0.8406) time: 3.9739 data: 3.8735 max mem: 26157 Eval (hcp-train-subset): [39] [61/62] eta: 0:00:00 loss: 0.8278 (0.8295) time: 0.1413 data: 0.0564 max mem: 26157 Eval (hcp-train-subset): [39] Total time: 0:00:12 (0.2092 s / it) Averaged stats (hcp-train-subset): loss: 0.8278 (0.8295) Making plots (hcp-train-subset): example=47 Eval (hcp-val): [39] [ 0/62] eta: 0:05:41 loss: 0.8248 (0.8248) time: 5.5078 data: 5.4230 max mem: 26157 Eval (hcp-val): [39] [61/62] eta: 0:00:00 loss: 0.8254 (0.8272) time: 0.1181 data: 0.0350 max mem: 26157 Eval (hcp-val): [39] Total time: 0:00:12 (0.2024 s / it) Averaged stats (hcp-val): loss: 0.8254 (0.8272) Making plots (hcp-val): example=36 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [40] [ 0/6250] eta: 10:48:10 lr: 0.000088 grad: 0.1008 (0.1008) loss: 0.8240 (0.8240) time: 6.2224 data: 5.9428 max mem: 26157 Train: [40] [ 100/6250] eta: 0:34:24 lr: 0.000088 grad: 0.1096 (0.1158) loss: 0.8047 (0.8133) time: 0.2703 data: 0.0002 max mem: 26157 Train: [40] [ 200/6250] eta: 0:30:38 lr: 0.000088 grad: 0.1047 (0.1113) loss: 0.8120 (0.8090) time: 0.2732 data: 0.0002 max mem: 26157 Train: [40] [ 300/6250] eta: 0:29:03 lr: 0.000088 grad: 0.1049 (0.1094) loss: 0.8010 (0.8067) time: 0.2724 data: 0.0002 max mem: 26157 Train: [40] [ 400/6250] eta: 0:28:02 lr: 0.000087 grad: 0.0955 (0.1069) loss: 0.8045 (0.8062) time: 0.2679 data: 0.0003 max mem: 26157 Train: [40] [ 500/6250] eta: 0:27:14 lr: 0.000087 grad: 0.0970 (0.1054) loss: 0.8106 (0.8067) time: 0.2700 data: 0.0002 max mem: 26157 Train: [40] [ 600/6250] eta: 0:26:32 lr: 0.000087 grad: 0.0980 (0.1048) loss: 0.8097 (0.8071) time: 0.2697 data: 0.0002 max mem: 26157 Train: [40] [ 700/6250] eta: 0:25:55 lr: 0.000087 grad: 0.0927 (0.1045) loss: 0.8212 (0.8077) time: 0.2722 data: 0.0003 max mem: 26157 Train: [40] [ 800/6250] eta: 0:25:22 lr: 0.000087 grad: 0.1007 (0.1042) loss: 0.8085 (0.8076) time: 0.2692 data: 0.0002 max mem: 26157 Train: [40] [ 900/6250] eta: 0:24:48 lr: 0.000087 grad: 0.1023 (0.1046) loss: 0.8028 (0.8072) time: 0.2695 data: 0.0002 max mem: 26157 Train: [40] [1000/6250] eta: 0:24:17 lr: 0.000087 grad: 0.1001 (0.1047) loss: 0.8098 (0.8070) time: 0.2738 data: 0.0002 max mem: 26157 Train: [40] [1100/6250] eta: 0:23:49 lr: 0.000087 grad: 0.0995 (0.1047) loss: 0.8094 (0.8068) time: 0.2718 data: 0.0002 max mem: 26157 Train: [40] [1200/6250] eta: 0:23:18 lr: 0.000087 grad: 0.0988 (0.1045) loss: 0.8072 (0.8066) time: 0.2701 data: 0.0002 max mem: 26157 Train: [40] [1300/6250] eta: 0:22:48 lr: 0.000087 grad: 0.1035 (0.1045) loss: 0.8029 (0.8063) time: 0.2698 data: 0.0003 max mem: 26157 Train: [40] [1400/6250] eta: 0:22:30 lr: 0.000087 grad: 0.1002 (0.1043) loss: 0.7975 (0.8061) time: 0.4139 data: 0.1395 max mem: 26157 Train: [40] [1500/6250] eta: 0:21:59 lr: 0.000087 grad: 0.0953 (0.1042) loss: 0.8063 (0.8059) time: 0.2685 data: 0.0002 max mem: 26157 Train: [40] [1600/6250] eta: 0:21:30 lr: 0.000087 grad: 0.0950 (0.1041) loss: 0.8085 (0.8059) time: 0.2733 data: 0.0002 max mem: 26157 Train: [40] [1700/6250] eta: 0:21:07 lr: 0.000087 grad: 0.1037 (0.1039) loss: 0.7993 (0.8057) time: 0.2684 data: 0.0002 max mem: 26157 Train: [40] [1800/6250] eta: 0:20:37 lr: 0.000087 grad: 0.0994 (0.1040) loss: 0.7936 (0.8054) time: 0.2714 data: 0.0002 max mem: 26157 Train: [40] [1900/6250] eta: 0:20:22 lr: 0.000087 grad: 0.1056 (0.1039) loss: 0.7999 (0.8052) time: 0.2702 data: 0.0002 max mem: 26157 Train: [40] [2000/6250] eta: 0:19:52 lr: 0.000087 grad: 0.1040 (0.1038) loss: 0.8057 (0.8051) time: 0.2690 data: 0.0002 max mem: 26157 Train: [40] [2100/6250] eta: 0:19:22 lr: 0.000087 grad: 0.0989 (0.1037) loss: 0.8034 (0.8050) time: 0.2681 data: 0.0001 max mem: 26157 Train: [40] [2200/6250] eta: 0:18:52 lr: 0.000087 grad: 0.1032 (0.1038) loss: 0.7971 (0.8048) time: 0.2688 data: 0.0002 max mem: 26157 Train: [40] [2300/6250] eta: 0:18:22 lr: 0.000087 grad: 0.1023 (0.1038) loss: 0.7956 (0.8047) time: 0.2712 data: 0.0002 max mem: 26157 Train: [40] [2400/6250] eta: 0:17:53 lr: 0.000087 grad: 0.1038 (0.1039) loss: 0.8004 (0.8045) time: 0.2704 data: 0.0002 max mem: 26157 Train: [40] [2500/6250] eta: 0:17:24 lr: 0.000087 grad: 0.1040 (0.1041) loss: 0.7984 (0.8043) time: 0.2693 data: 0.0002 max mem: 26157 Train: [40] [2600/6250] eta: 0:16:55 lr: 0.000087 grad: 0.1051 (0.1045) loss: 0.7952 (0.8041) time: 0.2704 data: 0.0002 max mem: 26157 Train: [40] [2700/6250] eta: 0:16:26 lr: 0.000087 grad: 0.1101 (0.1045) loss: 0.8012 (0.8039) time: 0.2718 data: 0.0002 max mem: 26157 Train: [40] [2800/6250] eta: 0:15:57 lr: 0.000087 grad: 0.1012 (0.1047) loss: 0.8039 (0.8038) time: 0.2700 data: 0.0002 max mem: 26157 Train: [40] [2900/6250] eta: 0:15:29 lr: 0.000087 grad: 0.1011 (0.1049) loss: 0.8025 (0.8037) time: 0.2692 data: 0.0002 max mem: 26157 Train: [40] [3000/6250] eta: 0:15:00 lr: 0.000087 grad: 0.1031 (0.1051) loss: 0.7974 (0.8035) time: 0.2714 data: 0.0002 max mem: 26157 Train: [40] [3100/6250] eta: 0:14:32 lr: 0.000087 grad: 0.1005 (0.1051) loss: 0.8013 (0.8034) time: 0.2710 data: 0.0002 max mem: 26157 Train: [40] [3200/6250] eta: 0:14:05 lr: 0.000087 grad: 0.1061 (0.1052) loss: 0.7945 (0.8033) time: 0.2698 data: 0.0002 max mem: 26157 Train: [40] [3300/6250] eta: 0:13:38 lr: 0.000087 grad: 0.1105 (0.1052) loss: 0.8034 (0.8032) time: 0.2711 data: 0.0002 max mem: 26157 Train: [40] [3400/6250] eta: 0:13:10 lr: 0.000087 grad: 0.1033 (0.1052) loss: 0.8112 (0.8032) time: 0.2759 data: 0.0002 max mem: 26157 Train: [40] [3500/6250] eta: 0:12:47 lr: 0.000087 grad: 0.1064 (0.1052) loss: 0.8006 (0.8032) time: 0.2688 data: 0.0002 max mem: 26157 Train: [40] [3600/6250] eta: 0:12:18 lr: 0.000087 grad: 0.0989 (0.1052) loss: 0.8084 (0.8032) time: 0.2695 data: 0.0002 max mem: 26157 Train: [40] [3700/6250] eta: 0:11:51 lr: 0.000086 grad: 0.0987 (0.1052) loss: 0.8012 (0.8032) time: 0.3867 data: 0.1151 max mem: 26157 Train: [40] [3800/6250] eta: 0:11:23 lr: 0.000086 grad: 0.1075 (0.1051) loss: 0.8064 (0.8033) time: 0.2694 data: 0.0002 max mem: 26157 Train: [40] [3900/6250] eta: 0:10:54 lr: 0.000086 grad: 0.1013 (0.1051) loss: 0.8018 (0.8034) time: 0.2704 data: 0.0002 max mem: 26157 Train: [40] [4000/6250] eta: 0:10:28 lr: 0.000086 grad: 0.1008 (0.1050) loss: 0.8085 (0.8035) time: 0.4487 data: 0.1732 max mem: 26157 Train: [40] [4100/6250] eta: 0:09:59 lr: 0.000086 grad: 0.1048 (0.1050) loss: 0.8071 (0.8036) time: 0.2684 data: 0.0002 max mem: 26157 Train: [40] [4200/6250] eta: 0:09:31 lr: 0.000086 grad: 0.1022 (0.1051) loss: 0.8027 (0.8037) time: 0.2683 data: 0.0002 max mem: 26157 Train: [40] [4300/6250] eta: 0:09:03 lr: 0.000086 grad: 0.1098 (0.1051) loss: 0.7996 (0.8037) time: 0.2680 data: 0.0002 max mem: 26157 Train: [40] [4400/6250] eta: 0:08:35 lr: 0.000086 grad: 0.1060 (0.1051) loss: 0.7992 (0.8038) time: 0.2755 data: 0.0002 max mem: 26157 Train: [40] [4500/6250] eta: 0:08:07 lr: 0.000086 grad: 0.1091 (0.1052) loss: 0.8068 (0.8039) time: 0.2730 data: 0.0003 max mem: 26157 Train: [40] [4600/6250] eta: 0:07:39 lr: 0.000086 grad: 0.1076 (0.1053) loss: 0.8045 (0.8040) time: 0.2683 data: 0.0002 max mem: 26157 Train: [40] [4700/6250] eta: 0:07:11 lr: 0.000086 grad: 0.1036 (0.1052) loss: 0.7998 (0.8041) time: 0.2719 data: 0.0002 max mem: 26157 Train: [40] [4800/6250] eta: 0:06:43 lr: 0.000086 grad: 0.1003 (0.1051) loss: 0.8135 (0.8043) time: 0.2703 data: 0.0002 max mem: 26157 Train: [40] [4900/6250] eta: 0:06:15 lr: 0.000086 grad: 0.1045 (0.1051) loss: 0.8120 (0.8043) time: 0.2718 data: 0.0002 max mem: 26157 Train: [40] [5000/6250] eta: 0:05:47 lr: 0.000086 grad: 0.1072 (0.1053) loss: 0.8006 (0.8044) time: 0.2705 data: 0.0002 max mem: 26157 Train: [40] [5100/6250] eta: 0:05:19 lr: 0.000086 grad: 0.0932 (0.1053) loss: 0.8105 (0.8044) time: 0.2689 data: 0.0002 max mem: 26157 Train: [40] [5200/6250] eta: 0:04:51 lr: 0.000086 grad: 0.1088 (0.1053) loss: 0.8061 (0.8045) time: 0.2707 data: 0.0002 max mem: 26157 Train: [40] [5300/6250] eta: 0:04:23 lr: 0.000086 grad: 0.1046 (0.1052) loss: 0.8025 (0.8045) time: 0.2700 data: 0.0002 max mem: 26157 Train: [40] [5400/6250] eta: 0:03:56 lr: 0.000086 grad: 0.1038 (0.1052) loss: 0.8125 (0.8046) time: 0.2733 data: 0.0002 max mem: 26157 Train: [40] [5500/6250] eta: 0:03:29 lr: 0.000086 grad: 0.1013 (0.1051) loss: 0.8043 (0.8047) time: 0.2713 data: 0.0002 max mem: 26157 Train: [40] [5600/6250] eta: 0:03:01 lr: 0.000086 grad: 0.0988 (0.1051) loss: 0.8117 (0.8047) time: 0.2679 data: 0.0002 max mem: 26157 Train: [40] [5700/6250] eta: 0:02:33 lr: 0.000086 grad: 0.1063 (0.1050) loss: 0.8047 (0.8048) time: 0.2718 data: 0.0002 max mem: 26157 Train: [40] [5800/6250] eta: 0:02:05 lr: 0.000086 grad: 0.1057 (0.1051) loss: 0.8022 (0.8048) time: 0.2691 data: 0.0002 max mem: 26157 Train: [40] [5900/6250] eta: 0:01:37 lr: 0.000086 grad: 0.1010 (0.1050) loss: 0.8147 (0.8048) time: 0.2701 data: 0.0002 max mem: 26157 Train: [40] [6000/6250] eta: 0:01:09 lr: 0.000086 grad: 0.1031 (0.1050) loss: 0.8022 (0.8048) time: 0.2684 data: 0.0002 max mem: 26157 Train: [40] [6100/6250] eta: 0:00:41 lr: 0.000086 grad: 0.1028 (0.1050) loss: 0.8083 (0.8049) time: 0.2676 data: 0.0002 max mem: 26157 Train: [40] [6200/6250] eta: 0:00:13 lr: 0.000086 grad: 0.0988 (0.1049) loss: 0.8086 (0.8049) time: 0.2680 data: 0.0001 max mem: 26157 Train: [40] [6249/6250] eta: 0:00:00 lr: 0.000086 grad: 0.1027 (0.1049) loss: 0.8044 (0.8049) time: 0.2685 data: 0.0002 max mem: 26157 Train: [40] Total time: 0:29:07 (0.2796 s / it) Averaged stats: lr: 0.000086 grad: 0.1027 (0.1049) loss: 0.8044 (0.8049) Eval (hcp-train-subset): [40] [ 0/62] eta: 0:03:44 loss: 0.8433 (0.8433) time: 3.6229 data: 3.5128 max mem: 26157 Eval (hcp-train-subset): [40] [61/62] eta: 0:00:00 loss: 0.8234 (0.8269) time: 0.1132 data: 0.0303 max mem: 26157 Eval (hcp-train-subset): [40] Total time: 0:00:13 (0.2185 s / it) Averaged stats (hcp-train-subset): loss: 0.8234 (0.8269) Making plots (hcp-train-subset): example=33 Eval (hcp-val): [40] [ 0/62] eta: 0:05:00 loss: 0.8219 (0.8219) time: 4.8496 data: 4.7502 max mem: 26157 Eval (hcp-val): [40] [61/62] eta: 0:00:00 loss: 0.8254 (0.8263) time: 0.1302 data: 0.0456 max mem: 26157 Eval (hcp-val): [40] Total time: 0:00:13 (0.2235 s / it) Averaged stats (hcp-val): loss: 0.8254 (0.8263) Making plots (hcp-val): example=59 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [41] [ 0/6250] eta: 12:15:30 lr: 0.000086 grad: nan (nan) loss: 0.8215 (0.8215) time: 7.0608 data: 6.7901 max mem: 26157 Train: [41] [ 100/6250] eta: 0:34:44 lr: 0.000086 grad: 0.0992 (0.1171) loss: 0.8166 (0.8234) time: 0.2725 data: 0.0002 max mem: 26157 Train: [41] [ 200/6250] eta: 0:30:42 lr: 0.000086 grad: 0.0917 (0.1091) loss: 0.8187 (0.8216) time: 0.2687 data: 0.0002 max mem: 26157 Train: [41] [ 300/6250] eta: 0:29:02 lr: 0.000086 grad: 0.0914 (0.1064) loss: 0.8184 (0.8202) time: 0.2696 data: 0.0002 max mem: 26157 Train: [41] [ 400/6250] eta: 0:27:59 lr: 0.000086 grad: 0.1040 (0.1046) loss: 0.8176 (0.8191) time: 0.2707 data: 0.0002 max mem: 26157 Train: [41] [ 500/6250] eta: 0:27:10 lr: 0.000086 grad: 0.0986 (0.1037) loss: 0.8144 (0.8184) time: 0.2687 data: 0.0002 max mem: 26157 Train: [41] [ 600/6250] eta: 0:26:28 lr: 0.000086 grad: 0.0950 (0.1038) loss: 0.8140 (0.8170) time: 0.2692 data: 0.0001 max mem: 26157 Train: [41] [ 700/6250] eta: 0:25:50 lr: 0.000085 grad: 0.0924 (0.1034) loss: 0.8095 (0.8161) time: 0.2684 data: 0.0002 max mem: 26157 Train: [41] [ 800/6250] eta: 0:25:16 lr: 0.000085 grad: 0.1028 (0.1030) loss: 0.8083 (0.8152) time: 0.2713 data: 0.0002 max mem: 26157 Train: [41] [ 900/6250] eta: 0:24:43 lr: 0.000085 grad: 0.0975 (0.1027) loss: 0.8120 (0.8146) time: 0.2693 data: 0.0002 max mem: 26157 Train: [41] [1000/6250] eta: 0:24:11 lr: 0.000085 grad: 0.1009 (0.1031) loss: 0.8133 (0.8140) time: 0.2692 data: 0.0003 max mem: 26157 Train: [41] [1100/6250] eta: 0:23:40 lr: 0.000085 grad: 0.1006 (0.1032) loss: 0.8064 (0.8134) time: 0.2732 data: 0.0002 max mem: 26157 Train: [41] [1200/6250] eta: 0:23:14 lr: 0.000085 grad: 0.0974 (0.1032) loss: 0.8114 (0.8128) time: 0.2695 data: 0.0002 max mem: 26157 Train: [41] [1300/6250] eta: 0:22:45 lr: 0.000085 grad: 0.0977 (0.1032) loss: 0.8072 (0.8123) time: 0.2728 data: 0.0003 max mem: 26157 Train: [41] [1400/6250] eta: 0:22:15 lr: 0.000085 grad: 0.1011 (0.1032) loss: 0.7970 (0.8118) time: 0.2696 data: 0.0002 max mem: 26157 Train: [41] [1500/6250] eta: 0:22:23 lr: 0.000085 grad: 0.0999 (0.1037) loss: 0.8084 (0.8114) time: 0.4355 data: 0.1616 max mem: 26157 Train: [41] [1600/6250] eta: 0:21:51 lr: 0.000085 grad: 0.0972 (0.1037) loss: 0.8110 (0.8112) time: 0.2686 data: 0.0002 max mem: 26157 Train: [41] [1700/6250] eta: 0:21:19 lr: 0.000085 grad: 0.1022 (0.1037) loss: 0.8130 (0.8111) time: 0.2681 data: 0.0002 max mem: 26157 Train: [41] [1800/6250] eta: 0:20:48 lr: 0.000085 grad: 0.1042 (0.1037) loss: 0.8016 (0.8109) time: 0.2681 data: 0.0002 max mem: 26157 Train: [41] [1900/6250] eta: 0:20:17 lr: 0.000085 grad: 0.1011 (0.1037) loss: 0.8074 (0.8108) time: 0.2693 data: 0.0002 max mem: 26157 Train: [41] [2000/6250] eta: 0:19:47 lr: 0.000085 grad: 0.1080 (0.1038) loss: 0.8110 (0.8107) time: 0.2720 data: 0.0003 max mem: 26157 Train: [41] [2100/6250] eta: 0:19:22 lr: 0.000085 grad: 0.1068 (0.1037) loss: 0.8049 (0.8106) time: 0.2698 data: 0.0002 max mem: 26157 Train: [41] [2200/6250] eta: 0:18:55 lr: 0.000085 grad: 0.1037 (0.1043) loss: 0.8079 (0.8105) time: 0.2733 data: 0.0002 max mem: 26157 Train: [41] [2300/6250] eta: 0:18:27 lr: 0.000085 grad: 0.1021 (0.1045) loss: 0.8097 (0.8103) time: 0.2993 data: 0.0003 max mem: 26157 Train: [41] [2400/6250] eta: 0:18:04 lr: 0.000085 grad: 0.1084 (0.1047) loss: 0.8029 (0.8101) time: 0.2684 data: 0.0002 max mem: 26157 Train: [41] [2500/6250] eta: 0:17:51 lr: 0.000085 grad: 0.1051 (0.1046) loss: 0.8001 (0.8100) time: 0.8263 data: 0.5538 max mem: 26157 Train: [41] [2600/6250] eta: 0:17:20 lr: 0.000085 grad: 0.1016 (0.1046) loss: 0.8055 (0.8098) time: 0.2706 data: 0.0002 max mem: 26157 Train: [41] [2700/6250] eta: 0:16:53 lr: 0.000085 grad: 0.0992 (0.1047) loss: 0.8051 (0.8096) time: 0.2696 data: 0.0002 max mem: 26157 Train: [41] [2800/6250] eta: 0:16:23 lr: 0.000085 grad: 0.1026 (0.1046) loss: 0.7985 (0.8094) time: 0.2681 data: 0.0002 max mem: 26157 Train: [41] [2900/6250] eta: 0:15:53 lr: 0.000085 grad: 0.1119 (0.1047) loss: 0.8022 (0.8092) time: 0.2695 data: 0.0002 max mem: 26157 Train: [41] [3000/6250] eta: 0:15:23 lr: 0.000085 grad: 0.1115 (0.1048) loss: 0.7984 (0.8090) time: 0.2689 data: 0.0002 max mem: 26157 Train: [41] [3100/6250] eta: 0:14:58 lr: 0.000085 grad: 0.1101 (0.1050) loss: 0.7999 (0.8088) time: 0.2706 data: 0.0002 max mem: 26157 Train: [41] [3200/6250] eta: 0:14:28 lr: 0.000085 grad: 0.1030 (0.1050) loss: 0.7994 (0.8086) time: 0.2788 data: 0.0002 max mem: 26157 Train: [41] [3300/6250] eta: 0:13:59 lr: 0.000085 grad: 0.1015 (0.1051) loss: 0.8048 (0.8084) time: 0.2765 data: 0.0002 max mem: 26157 Train: [41] [3400/6250] eta: 0:13:30 lr: 0.000085 grad: 0.1059 (0.1051) loss: 0.8058 (0.8082) time: 0.2695 data: 0.0002 max mem: 26157 Train: [41] [3500/6250] eta: 0:13:08 lr: 0.000085 grad: 0.1047 (0.1052) loss: 0.8084 (0.8080) time: 0.7086 data: 0.4377 max mem: 26157 Train: [41] [3600/6250] eta: 0:12:38 lr: 0.000085 grad: 0.1104 (0.1054) loss: 0.7958 (0.8079) time: 0.2691 data: 0.0002 max mem: 26157 Train: [41] [3700/6250] eta: 0:12:08 lr: 0.000085 grad: 0.1057 (0.1055) loss: 0.7946 (0.8077) time: 0.2687 data: 0.0002 max mem: 26157 Train: [41] [3800/6250] eta: 0:11:38 lr: 0.000085 grad: 0.1056 (0.1056) loss: 0.8015 (0.8074) time: 0.2699 data: 0.0002 max mem: 26157 Train: [41] [3900/6250] eta: 0:11:09 lr: 0.000084 grad: 0.1077 (0.1058) loss: 0.7987 (0.8072) time: 0.2696 data: 0.0002 max mem: 26157 Train: [41] [4000/6250] eta: 0:10:39 lr: 0.000084 grad: 0.1027 (0.1060) loss: 0.7989 (0.8070) time: 0.2699 data: 0.0002 max mem: 26157 Train: [41] [4100/6250] eta: 0:10:10 lr: 0.000084 grad: 0.1061 (0.1063) loss: 0.7993 (0.8068) time: 0.2697 data: 0.0002 max mem: 26157 Train: [41] [4200/6250] eta: 0:09:41 lr: 0.000084 grad: 0.1055 (0.1063) loss: 0.7968 (0.8067) time: 0.2689 data: 0.0002 max mem: 26157 Train: [41] [4300/6250] eta: 0:09:12 lr: 0.000084 grad: 0.0981 (0.1064) loss: 0.8036 (0.8065) time: 0.2685 data: 0.0002 max mem: 26157 Train: [41] [4400/6250] eta: 0:08:43 lr: 0.000084 grad: 0.1132 (0.1065) loss: 0.7986 (0.8064) time: 0.2686 data: 0.0002 max mem: 26157 Train: [41] [4500/6250] eta: 0:08:14 lr: 0.000084 grad: 0.1065 (0.1066) loss: 0.7982 (0.8062) time: 0.2687 data: 0.0002 max mem: 26157 Train: [41] [4600/6250] eta: 0:07:46 lr: 0.000084 grad: 0.1050 (0.1067) loss: 0.8043 (0.8062) time: 0.2682 data: 0.0002 max mem: 26157 Train: [41] [4700/6250] eta: 0:07:17 lr: 0.000084 grad: 0.1060 (0.1070) loss: 0.8055 (0.8061) time: 0.2692 data: 0.0002 max mem: 26157 Train: [41] [4800/6250] eta: 0:06:48 lr: 0.000084 grad: 0.1094 (0.1071) loss: 0.8053 (0.8061) time: 0.2694 data: 0.0002 max mem: 26157 Train: [41] [4900/6250] eta: 0:06:20 lr: 0.000084 grad: 0.1051 (0.1072) loss: 0.7986 (0.8061) time: 0.2715 data: 0.0002 max mem: 26157 Train: [41] [5000/6250] eta: 0:05:53 lr: 0.000084 grad: 0.1022 (0.1073) loss: 0.8109 (0.8061) time: 0.2697 data: 0.0002 max mem: 26157 Train: [41] [5100/6250] eta: 0:05:25 lr: 0.000084 grad: 0.1106 (0.1073) loss: 0.8093 (0.8062) time: 0.2704 data: 0.0002 max mem: 26157 Train: [41] [5200/6250] eta: 0:04:56 lr: 0.000084 grad: 0.0994 (0.1074) loss: 0.8107 (0.8061) time: 0.2684 data: 0.0002 max mem: 26157 Train: [41] [5300/6250] eta: 0:04:28 lr: 0.000084 grad: 0.1075 (0.1074) loss: 0.8093 (0.8062) time: 0.3278 data: 0.0529 max mem: 26157 Train: [41] [5400/6250] eta: 0:04:00 lr: 0.000084 grad: 0.1006 (0.1074) loss: 0.8026 (0.8062) time: 0.2724 data: 0.0002 max mem: 26157 Train: [41] [5500/6250] eta: 0:03:31 lr: 0.000084 grad: 0.1049 (0.1074) loss: 0.8033 (0.8062) time: 0.2699 data: 0.0002 max mem: 26157 Train: [41] [5600/6250] eta: 0:03:03 lr: 0.000084 grad: 0.1103 (0.1074) loss: 0.8117 (0.8062) time: 0.2729 data: 0.0002 max mem: 26157 Train: [41] [5700/6250] eta: 0:02:34 lr: 0.000084 grad: 0.1037 (0.1074) loss: 0.8069 (0.8062) time: 0.2704 data: 0.0002 max mem: 26157 Train: [41] [5800/6250] eta: 0:02:06 lr: 0.000084 grad: 0.1028 (0.1074) loss: 0.8131 (0.8062) time: 0.2691 data: 0.0002 max mem: 26157 Train: [41] [5900/6250] eta: 0:01:38 lr: 0.000084 grad: 0.1066 (0.1073) loss: 0.8042 (0.8063) time: 0.2706 data: 0.0002 max mem: 26157 Train: [41] [6000/6250] eta: 0:01:10 lr: 0.000084 grad: 0.1075 (0.1074) loss: 0.7989 (0.8063) time: 0.2684 data: 0.0002 max mem: 26157 Train: [41] [6100/6250] eta: 0:00:42 lr: 0.000084 grad: 0.1070 (0.1074) loss: 0.8037 (0.8062) time: 0.2705 data: 0.0002 max mem: 26157 Train: [41] [6200/6250] eta: 0:00:14 lr: 0.000084 grad: 0.1027 (0.1075) loss: 0.8127 (0.8062) time: 0.2730 data: 0.0002 max mem: 26157 Train: [41] [6249/6250] eta: 0:00:00 lr: 0.000084 grad: 0.1070 (0.1075) loss: 0.8025 (0.8062) time: 0.2685 data: 0.0002 max mem: 26157 Train: [41] Total time: 0:29:23 (0.2821 s / it) Averaged stats: lr: 0.000084 grad: 0.1070 (0.1075) loss: 0.8025 (0.8062) Eval (hcp-train-subset): [41] [ 0/62] eta: 0:03:25 loss: 0.8341 (0.8341) time: 3.3181 data: 3.1990 max mem: 26157 Eval (hcp-train-subset): [41] [61/62] eta: 0:00:00 loss: 0.8211 (0.8255) time: 0.1311 data: 0.0461 max mem: 26157 Eval (hcp-train-subset): [41] Total time: 0:00:13 (0.2119 s / it) Averaged stats (hcp-train-subset): loss: 0.8211 (0.8255) Making plots (hcp-train-subset): example=52 Eval (hcp-val): [41] [ 0/62] eta: 0:05:14 loss: 0.8235 (0.8235) time: 5.0685 data: 4.9850 max mem: 26157 Eval (hcp-val): [41] [61/62] eta: 0:00:00 loss: 0.8253 (0.8266) time: 0.1280 data: 0.0453 max mem: 26157 Eval (hcp-val): [41] Total time: 0:00:13 (0.2123 s / it) Averaged stats (hcp-val): loss: 0.8253 (0.8266) Making plots (hcp-val): example=33 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [42] [ 0/6250] eta: 6:53:38 lr: 0.000084 grad: 0.0863 (0.0863) loss: 0.8312 (0.8312) time: 3.9709 data: 3.6112 max mem: 26157 Train: [42] [ 100/6250] eta: 0:33:16 lr: 0.000084 grad: 0.0946 (0.1138) loss: 0.8339 (0.8336) time: 0.2706 data: 0.0002 max mem: 26157 Train: [42] [ 200/6250] eta: 0:30:02 lr: 0.000084 grad: 0.1013 (0.1138) loss: 0.8065 (0.8242) time: 0.2725 data: 0.0002 max mem: 26157 Train: [42] [ 300/6250] eta: 0:28:37 lr: 0.000084 grad: 0.0982 (0.1122) loss: 0.8089 (0.8187) time: 0.2686 data: 0.0002 max mem: 26157 Train: [42] [ 400/6250] eta: 0:27:39 lr: 0.000084 grad: 0.1086 (0.1109) loss: 0.8097 (0.8152) time: 0.2698 data: 0.0002 max mem: 26157 Train: [42] [ 500/6250] eta: 0:26:57 lr: 0.000084 grad: 0.1050 (0.1102) loss: 0.8013 (0.8128) time: 0.2706 data: 0.0002 max mem: 26157 Train: [42] [ 600/6250] eta: 0:26:17 lr: 0.000084 grad: 0.1100 (0.1095) loss: 0.8106 (0.8112) time: 0.2685 data: 0.0002 max mem: 26157 Train: [42] [ 700/6250] eta: 0:26:00 lr: 0.000084 grad: 0.1076 (0.1096) loss: 0.8058 (0.8099) time: 0.3675 data: 0.0971 max mem: 26157 Train: [42] [ 800/6250] eta: 0:25:40 lr: 0.000084 grad: 0.1118 (0.1097) loss: 0.7971 (0.8087) time: 0.2689 data: 0.0002 max mem: 26157 Train: [42] [ 900/6250] eta: 0:25:10 lr: 0.000083 grad: 0.1095 (0.1097) loss: 0.8074 (0.8080) time: 0.3157 data: 0.0436 max mem: 26157 Train: [42] [1000/6250] eta: 0:24:35 lr: 0.000083 grad: 0.1020 (0.1096) loss: 0.8026 (0.8074) time: 0.2691 data: 0.0002 max mem: 26157 Train: [42] [1100/6250] eta: 0:24:02 lr: 0.000083 grad: 0.1029 (0.1093) loss: 0.8014 (0.8070) time: 0.2686 data: 0.0002 max mem: 26157 Train: [42] [1200/6250] eta: 0:23:29 lr: 0.000083 grad: 0.1001 (0.1092) loss: 0.8031 (0.8065) time: 0.2688 data: 0.0002 max mem: 26157 Train: [42] [1300/6250] eta: 0:22:57 lr: 0.000083 grad: 0.1048 (0.1092) loss: 0.8032 (0.8061) time: 0.2682 data: 0.0001 max mem: 26157 Train: [42] [1400/6250] eta: 0:22:26 lr: 0.000083 grad: 0.1095 (0.1094) loss: 0.8019 (0.8057) time: 0.2678 data: 0.0002 max mem: 26157 Train: [42] [1500/6250] eta: 0:21:56 lr: 0.000083 grad: 0.1085 (0.1097) loss: 0.7957 (0.8053) time: 0.2734 data: 0.0002 max mem: 26157 Train: [42] [1600/6250] eta: 0:21:27 lr: 0.000083 grad: 0.1128 (0.1100) loss: 0.7998 (0.8049) time: 0.2719 data: 0.0002 max mem: 26157 Train: [42] [1700/6250] eta: 0:20:57 lr: 0.000083 grad: 0.1112 (0.1102) loss: 0.7900 (0.8044) time: 0.2690 data: 0.0002 max mem: 26157 Train: [42] [1800/6250] eta: 0:20:28 lr: 0.000083 grad: 0.1058 (0.1103) loss: 0.7931 (0.8040) time: 0.2696 data: 0.0002 max mem: 26157 Train: [42] [1900/6250] eta: 0:19:59 lr: 0.000083 grad: 0.1113 (0.1107) loss: 0.7934 (0.8035) time: 0.2701 data: 0.0002 max mem: 26157 Train: [42] [2000/6250] eta: 0:19:30 lr: 0.000083 grad: 0.1080 (0.1107) loss: 0.7950 (0.8031) time: 0.2696 data: 0.0002 max mem: 26157 Train: [42] [2100/6250] eta: 0:19:01 lr: 0.000083 grad: 0.1145 (0.1107) loss: 0.7885 (0.8029) time: 0.2702 data: 0.0002 max mem: 26157 Train: [42] [2200/6250] eta: 0:18:33 lr: 0.000083 grad: 0.1045 (0.1109) loss: 0.7984 (0.8026) time: 0.2688 data: 0.0002 max mem: 26157 Train: [42] [2300/6250] eta: 0:18:04 lr: 0.000083 grad: 0.1105 (0.1111) loss: 0.7969 (0.8024) time: 0.2719 data: 0.0002 max mem: 26157 Train: [42] [2400/6250] eta: 0:17:36 lr: 0.000083 grad: 0.1119 (0.1112) loss: 0.7907 (0.8021) time: 0.2712 data: 0.0002 max mem: 26157 Train: [42] [2500/6250] eta: 0:17:08 lr: 0.000083 grad: 0.1044 (0.1112) loss: 0.7967 (0.8020) time: 0.2682 data: 0.0002 max mem: 26157 Train: [42] [2600/6250] eta: 0:16:40 lr: 0.000083 grad: 0.1103 (0.1115) loss: 0.7954 (0.8017) time: 0.2689 data: 0.0002 max mem: 26157 Train: [42] [2700/6250] eta: 0:16:12 lr: 0.000083 grad: 0.1155 (0.1117) loss: 0.8015 (0.8015) time: 0.2695 data: 0.0002 max mem: 26157 Train: [42] [2800/6250] eta: 0:15:44 lr: 0.000083 grad: 0.1115 (0.1118) loss: 0.7945 (0.8013) time: 0.2684 data: 0.0002 max mem: 26157 Train: [42] [2900/6250] eta: 0:15:16 lr: 0.000083 grad: 0.1136 (0.1118) loss: 0.8004 (0.8012) time: 0.2684 data: 0.0002 max mem: 26157 Train: [42] [3000/6250] eta: 0:14:48 lr: 0.000083 grad: 0.1080 (0.1119) loss: 0.8007 (0.8011) time: 0.2695 data: 0.0002 max mem: 26157 Train: [42] [3100/6250] eta: 0:14:20 lr: 0.000083 grad: 0.1063 (0.1118) loss: 0.7955 (0.8011) time: 0.2724 data: 0.0002 max mem: 26157 Train: [42] [3200/6250] eta: 0:13:52 lr: 0.000083 grad: 0.1112 (0.1119) loss: 0.7998 (0.8010) time: 0.2691 data: 0.0002 max mem: 26157 Train: [42] [3300/6250] eta: 0:13:25 lr: 0.000083 grad: 0.1022 (0.1117) loss: 0.8028 (0.8011) time: 0.2906 data: 0.0004 max mem: 26157 Train: [42] [3400/6250] eta: 0:12:58 lr: 0.000083 grad: 0.1035 (0.1116) loss: 0.8016 (0.8011) time: 0.2715 data: 0.0002 max mem: 26157 Train: [42] [3500/6250] eta: 0:12:31 lr: 0.000083 grad: 0.1041 (0.1116) loss: 0.8000 (0.8010) time: 0.2700 data: 0.0002 max mem: 26157 Train: [42] [3600/6250] eta: 0:12:04 lr: 0.000083 grad: 0.1048 (0.1115) loss: 0.8034 (0.8009) time: 0.2691 data: 0.0002 max mem: 26157 Train: [42] [3700/6250] eta: 0:11:36 lr: 0.000083 grad: 0.0998 (0.1115) loss: 0.8050 (0.8009) time: 0.2700 data: 0.0002 max mem: 26157 Train: [42] [3800/6250] eta: 0:11:09 lr: 0.000083 grad: 0.1034 (0.1114) loss: 0.8097 (0.8009) time: 0.2723 data: 0.0002 max mem: 26157 Train: [42] [3900/6250] eta: 0:10:41 lr: 0.000083 grad: 0.1064 (0.1114) loss: 0.8012 (0.8009) time: 0.2715 data: 0.0002 max mem: 26157 Train: [42] [4000/6250] eta: 0:10:14 lr: 0.000083 grad: 0.1036 (0.1115) loss: 0.8011 (0.8009) time: 0.2697 data: 0.0002 max mem: 26157 Train: [42] [4100/6250] eta: 0:09:48 lr: 0.000082 grad: 0.1091 (0.1115) loss: 0.8003 (0.8010) time: 0.2688 data: 0.0001 max mem: 26157 Train: [42] [4200/6250] eta: 0:09:21 lr: 0.000082 grad: 0.1044 (0.1116) loss: 0.7996 (0.8010) time: 0.2733 data: 0.0002 max mem: 26157 Train: [42] [4300/6250] eta: 0:08:53 lr: 0.000082 grad: 0.1074 (0.1117) loss: 0.8052 (0.8012) time: 0.2706 data: 0.0002 max mem: 26157 Train: [42] [4400/6250] eta: 0:08:26 lr: 0.000082 grad: 0.1278 (0.1119) loss: 0.8048 (0.8012) time: 0.2733 data: 0.0002 max mem: 26157 Train: [42] [4500/6250] eta: 0:07:58 lr: 0.000082 grad: 0.1208 (0.1120) loss: 0.8085 (0.8013) time: 0.2696 data: 0.0002 max mem: 26157 Train: [42] [4600/6250] eta: 0:07:31 lr: 0.000082 grad: 0.1086 (0.1124) loss: 0.8041 (0.8014) time: 0.2692 data: 0.0002 max mem: 26157 Train: [42] [4700/6250] eta: 0:07:03 lr: 0.000082 grad: 0.1126 (0.1125) loss: 0.8049 (0.8016) time: 0.2699 data: 0.0002 max mem: 26157 Train: [42] [4800/6250] eta: 0:06:36 lr: 0.000082 grad: 0.1102 (0.1127) loss: 0.8026 (0.8016) time: 0.2708 data: 0.0002 max mem: 26157 Train: [42] [4900/6250] eta: 0:06:08 lr: 0.000082 grad: 0.1083 (0.1128) loss: 0.8079 (0.8017) time: 0.2692 data: 0.0002 max mem: 26157 Train: [42] [5000/6250] eta: 0:05:41 lr: 0.000082 grad: 0.1112 (0.1128) loss: 0.8006 (0.8017) time: 0.2691 data: 0.0002 max mem: 26157 Train: [42] [5100/6250] eta: 0:05:14 lr: 0.000082 grad: 0.1077 (0.1128) loss: 0.8092 (0.8017) time: 0.2688 data: 0.0002 max mem: 26157 Train: [42] [5200/6250] eta: 0:04:46 lr: 0.000082 grad: 0.1045 (0.1127) loss: 0.8121 (0.8019) time: 0.2726 data: 0.0002 max mem: 26157 Train: [42] [5300/6250] eta: 0:04:19 lr: 0.000082 grad: 0.1131 (0.1128) loss: 0.8036 (0.8019) time: 0.2694 data: 0.0002 max mem: 26157 Train: [42] [5400/6250] eta: 0:03:51 lr: 0.000082 grad: 0.1147 (0.1129) loss: 0.8054 (0.8019) time: 0.2692 data: 0.0002 max mem: 26157 Train: [42] [5500/6250] eta: 0:03:24 lr: 0.000082 grad: 0.1170 (0.1130) loss: 0.8044 (0.8019) time: 0.2684 data: 0.0001 max mem: 26157 Train: [42] [5600/6250] eta: 0:02:57 lr: 0.000082 grad: 0.1031 (0.1130) loss: 0.8037 (0.8020) time: 0.2681 data: 0.0002 max mem: 26157 Train: [42] [5700/6250] eta: 0:02:29 lr: 0.000082 grad: 0.1112 (0.1130) loss: 0.8051 (0.8020) time: 0.2689 data: 0.0002 max mem: 26157 Train: [42] [5800/6250] eta: 0:02:02 lr: 0.000082 grad: 0.1142 (0.1130) loss: 0.8020 (0.8021) time: 0.2693 data: 0.0002 max mem: 26157 Train: [42] [5900/6250] eta: 0:01:35 lr: 0.000082 grad: 0.1085 (0.1130) loss: 0.8038 (0.8021) time: 0.2721 data: 0.0002 max mem: 26157 Train: [42] [6000/6250] eta: 0:01:08 lr: 0.000082 grad: 0.1017 (0.1130) loss: 0.8035 (0.8021) time: 0.2700 data: 0.0002 max mem: 26157 Train: [42] [6100/6250] eta: 0:00:40 lr: 0.000082 grad: 0.1062 (0.1132) loss: 0.8106 (0.8021) time: 0.2722 data: 0.0002 max mem: 26157 Train: [42] [6200/6250] eta: 0:00:13 lr: 0.000082 grad: 0.1162 (0.1132) loss: 0.8009 (0.8022) time: 0.2708 data: 0.0002 max mem: 26157 Train: [42] [6249/6250] eta: 0:00:00 lr: 0.000082 grad: 0.1215 (0.1132) loss: 0.8002 (0.8022) time: 0.2687 data: 0.0002 max mem: 26157 Train: [42] Total time: 0:28:29 (0.2735 s / it) Averaged stats: lr: 0.000082 grad: 0.1215 (0.1132) loss: 0.8002 (0.8022) Eval (hcp-train-subset): [42] [ 0/62] eta: 0:05:03 loss: 0.8303 (0.8303) time: 4.9029 data: 4.8185 max mem: 26157 Eval (hcp-train-subset): [42] [61/62] eta: 0:00:00 loss: 0.8231 (0.8248) time: 0.1277 data: 0.0426 max mem: 26157 Eval (hcp-train-subset): [42] Total time: 0:00:13 (0.2235 s / it) Averaged stats (hcp-train-subset): loss: 0.8231 (0.8248) Making plots (hcp-train-subset): example=46 Eval (hcp-val): [42] [ 0/62] eta: 0:05:54 loss: 0.8228 (0.8228) time: 5.7166 data: 5.6322 max mem: 26157 Eval (hcp-val): [42] [61/62] eta: 0:00:00 loss: 0.8263 (0.8270) time: 0.1393 data: 0.0543 max mem: 26157 Eval (hcp-val): [42] Total time: 0:00:13 (0.2187 s / it) Averaged stats (hcp-val): loss: 0.8263 (0.8270) Making plots (hcp-val): example=45 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [43] [ 0/6250] eta: 9:07:24 lr: 0.000082 grad: 0.0717 (0.0717) loss: 0.8529 (0.8529) time: 5.2550 data: 4.8607 max mem: 26157 Train: [43] [ 100/6250] eta: 0:33:43 lr: 0.000082 grad: 0.1010 (0.1140) loss: 0.8145 (0.8287) time: 0.2703 data: 0.0002 max mem: 26157 Train: [43] [ 200/6250] eta: 0:30:13 lr: 0.000082 grad: 0.1044 (0.1145) loss: 0.8003 (0.8200) time: 0.2704 data: 0.0002 max mem: 26157 Train: [43] [ 300/6250] eta: 0:28:46 lr: 0.000082 grad: 0.1094 (0.1141) loss: 0.8076 (0.8162) time: 0.2712 data: 0.0002 max mem: 26157 Train: [43] [ 400/6250] eta: 0:27:48 lr: 0.000082 grad: 0.1053 (0.1130) loss: 0.8034 (0.8139) time: 0.2706 data: 0.0002 max mem: 26157 Train: [43] [ 500/6250] eta: 0:27:03 lr: 0.000082 grad: 0.1059 (0.1119) loss: 0.8057 (0.8127) time: 0.2688 data: 0.0002 max mem: 26157 Train: [43] [ 600/6250] eta: 0:26:25 lr: 0.000082 grad: 0.0984 (0.1107) loss: 0.8100 (0.8120) time: 0.2706 data: 0.0002 max mem: 26157 Train: [43] [ 700/6250] eta: 0:25:49 lr: 0.000082 grad: 0.1052 (0.1098) loss: 0.8094 (0.8113) time: 0.2692 data: 0.0002 max mem: 26157 Train: [43] [ 800/6250] eta: 0:25:15 lr: 0.000082 grad: 0.1089 (0.1094) loss: 0.8168 (0.8112) time: 0.2709 data: 0.0002 max mem: 26157 Train: [43] [ 900/6250] eta: 0:24:44 lr: 0.000082 grad: 0.1021 (0.1096) loss: 0.8034 (0.8109) time: 0.2727 data: 0.0002 max mem: 26157 Train: [43] [1000/6250] eta: 0:24:14 lr: 0.000081 grad: 0.1066 (0.1091) loss: 0.8081 (0.8107) time: 0.2764 data: 0.0002 max mem: 26157 Train: [43] [1100/6250] eta: 0:23:48 lr: 0.000081 grad: 0.1022 (0.1088) loss: 0.8106 (0.8107) time: 0.2702 data: 0.0002 max mem: 26157 Train: [43] [1200/6250] eta: 0:23:16 lr: 0.000081 grad: 0.1132 (0.1087) loss: 0.8051 (0.8103) time: 0.2686 data: 0.0002 max mem: 26157 Train: [43] [1300/6250] eta: 0:22:46 lr: 0.000081 grad: 0.0952 (0.1087) loss: 0.8075 (0.8102) time: 0.2684 data: 0.0002 max mem: 26157 Train: [43] [1400/6250] eta: 0:22:18 lr: 0.000081 grad: 0.1016 (0.1086) loss: 0.8097 (0.8099) time: 0.2734 data: 0.0003 max mem: 26157 Train: [43] [1500/6250] eta: 0:22:04 lr: 0.000081 grad: 0.1046 (0.1084) loss: 0.8079 (0.8097) time: 0.3942 data: 0.1238 max mem: 26157 Train: [43] [1600/6250] eta: 0:21:34 lr: 0.000081 grad: 0.0980 (0.1083) loss: 0.8062 (0.8094) time: 0.2721 data: 0.0002 max mem: 26157 Train: [43] [1700/6250] eta: 0:21:04 lr: 0.000081 grad: 0.1037 (0.1083) loss: 0.8057 (0.8093) time: 0.2676 data: 0.0002 max mem: 26157 Train: [43] [1800/6250] eta: 0:20:34 lr: 0.000081 grad: 0.1112 (0.1084) loss: 0.8035 (0.8092) time: 0.2696 data: 0.0002 max mem: 26157 Train: [43] [1900/6250] eta: 0:20:04 lr: 0.000081 grad: 0.1032 (0.1083) loss: 0.8045 (0.8090) time: 0.2698 data: 0.0002 max mem: 26157 Train: [43] [2000/6250] eta: 0:19:35 lr: 0.000081 grad: 0.1029 (0.1081) loss: 0.8012 (0.8090) time: 0.2697 data: 0.0002 max mem: 26157 Train: [43] [2100/6250] eta: 0:19:06 lr: 0.000081 grad: 0.1032 (0.1081) loss: 0.8108 (0.8089) time: 0.2713 data: 0.0002 max mem: 26157 Train: [43] [2200/6250] eta: 0:18:37 lr: 0.000081 grad: 0.0976 (0.1081) loss: 0.8127 (0.8089) time: 0.2695 data: 0.0002 max mem: 26157 Train: [43] [2300/6250] eta: 0:18:08 lr: 0.000081 grad: 0.1058 (0.1079) loss: 0.8094 (0.8089) time: 0.2693 data: 0.0002 max mem: 26157 Train: [43] [2400/6250] eta: 0:17:40 lr: 0.000081 grad: 0.1113 (0.1083) loss: 0.8078 (0.8088) time: 0.2705 data: 0.0002 max mem: 26157 Train: [43] [2500/6250] eta: 0:17:25 lr: 0.000081 grad: 0.1101 (0.1083) loss: 0.8093 (0.8089) time: 0.2683 data: 0.0002 max mem: 26157 Train: [43] [2600/6250] eta: 0:16:56 lr: 0.000081 grad: 0.1089 (0.1083) loss: 0.8012 (0.8088) time: 0.2687 data: 0.0002 max mem: 26157 Train: [43] [2700/6250] eta: 0:16:27 lr: 0.000081 grad: 0.1150 (0.1083) loss: 0.8096 (0.8087) time: 0.2697 data: 0.0002 max mem: 26157 Train: [43] [2800/6250] eta: 0:15:58 lr: 0.000081 grad: 0.1033 (0.1085) loss: 0.8049 (0.8086) time: 0.2688 data: 0.0002 max mem: 26157 Train: [43] [2900/6250] eta: 0:15:29 lr: 0.000081 grad: 0.1156 (0.1087) loss: 0.8092 (0.8085) time: 0.2689 data: 0.0002 max mem: 26157 Train: [43] [3000/6250] eta: 0:15:00 lr: 0.000081 grad: 0.1073 (0.1091) loss: 0.8022 (0.8085) time: 0.2684 data: 0.0002 max mem: 26157 Train: [43] [3100/6250] eta: 0:14:32 lr: 0.000081 grad: 0.1072 (0.1092) loss: 0.8086 (0.8084) time: 0.2690 data: 0.0002 max mem: 26157 Train: [43] [3200/6250] eta: 0:14:03 lr: 0.000081 grad: 0.1092 (0.1093) loss: 0.7957 (0.8083) time: 0.2689 data: 0.0002 max mem: 26157 Train: [43] [3300/6250] eta: 0:13:35 lr: 0.000081 grad: 0.1136 (0.1096) loss: 0.7984 (0.8082) time: 0.2690 data: 0.0002 max mem: 26157 Train: [43] [3400/6250] eta: 0:13:07 lr: 0.000081 grad: 0.1121 (0.1098) loss: 0.8114 (0.8081) time: 0.2677 data: 0.0002 max mem: 26157 Train: [43] [3500/6250] eta: 0:12:38 lr: 0.000081 grad: 0.1066 (0.1098) loss: 0.8052 (0.8079) time: 0.2688 data: 0.0002 max mem: 26157 Train: [43] [3600/6250] eta: 0:12:10 lr: 0.000081 grad: 0.1133 (0.1099) loss: 0.8026 (0.8077) time: 0.2721 data: 0.0002 max mem: 26157 Train: [43] [3700/6250] eta: 0:11:42 lr: 0.000081 grad: 0.1035 (0.1100) loss: 0.8052 (0.8076) time: 0.2680 data: 0.0002 max mem: 26157 Train: [43] [3800/6250] eta: 0:11:14 lr: 0.000081 grad: 0.1108 (0.1101) loss: 0.8048 (0.8075) time: 0.2701 data: 0.0002 max mem: 26157 Train: [43] [3900/6250] eta: 0:10:46 lr: 0.000081 grad: 0.1119 (0.1101) loss: 0.7954 (0.8074) time: 0.2685 data: 0.0002 max mem: 26157 Train: [43] [4000/6250] eta: 0:10:19 lr: 0.000081 grad: 0.1083 (0.1102) loss: 0.8050 (0.8072) time: 0.2691 data: 0.0002 max mem: 26157 Train: [43] [4100/6250] eta: 0:09:51 lr: 0.000081 grad: 0.1113 (0.1104) loss: 0.7936 (0.8071) time: 0.2682 data: 0.0002 max mem: 26157 Train: [43] [4200/6250] eta: 0:09:23 lr: 0.000080 grad: 0.1098 (0.1105) loss: 0.8084 (0.8071) time: 0.2689 data: 0.0002 max mem: 26157 Train: [43] [4300/6250] eta: 0:08:55 lr: 0.000080 grad: 0.1113 (0.1105) loss: 0.7944 (0.8070) time: 0.2688 data: 0.0002 max mem: 26157 Train: [43] [4400/6250] eta: 0:08:28 lr: 0.000080 grad: 0.1127 (0.1105) loss: 0.8083 (0.8069) time: 0.2692 data: 0.0002 max mem: 26157 Train: [43] [4500/6250] eta: 0:08:00 lr: 0.000080 grad: 0.1079 (0.1106) loss: 0.7990 (0.8068) time: 0.2733 data: 0.0002 max mem: 26157 Train: [43] [4600/6250] eta: 0:07:32 lr: 0.000080 grad: 0.1097 (0.1106) loss: 0.8020 (0.8066) time: 0.2767 data: 0.0002 max mem: 26157 Train: [43] [4700/6250] eta: 0:07:05 lr: 0.000080 grad: 0.1142 (0.1108) loss: 0.8017 (0.8066) time: 0.2724 data: 0.0002 max mem: 26157 Train: [43] [4800/6250] eta: 0:06:39 lr: 0.000080 grad: 0.1030 (0.1108) loss: 0.8044 (0.8065) time: 0.2741 data: 0.0002 max mem: 26157 Train: [43] [4900/6250] eta: 0:06:11 lr: 0.000080 grad: 0.1043 (0.1108) loss: 0.8072 (0.8065) time: 0.2712 data: 0.0002 max mem: 26157 Train: [43] [5000/6250] eta: 0:05:44 lr: 0.000080 grad: 0.1091 (0.1109) loss: 0.8012 (0.8065) time: 0.2691 data: 0.0002 max mem: 26157 Train: [43] [5100/6250] eta: 0:05:16 lr: 0.000080 grad: 0.1006 (0.1109) loss: 0.8076 (0.8065) time: 0.2700 data: 0.0002 max mem: 26157 Train: [43] [5200/6250] eta: 0:04:49 lr: 0.000080 grad: 0.1081 (0.1109) loss: 0.8013 (0.8065) time: 0.2694 data: 0.0002 max mem: 26157 Train: [43] [5300/6250] eta: 0:04:21 lr: 0.000080 grad: 0.1079 (0.1109) loss: 0.8017 (0.8065) time: 0.2697 data: 0.0002 max mem: 26157 Train: [43] [5400/6250] eta: 0:03:53 lr: 0.000080 grad: 0.1116 (0.1109) loss: 0.8088 (0.8065) time: 0.2736 data: 0.0002 max mem: 26157 Train: [43] [5500/6250] eta: 0:03:26 lr: 0.000080 grad: 0.1066 (0.1110) loss: 0.8059 (0.8065) time: 0.2682 data: 0.0002 max mem: 26157 Train: [43] [5600/6250] eta: 0:02:58 lr: 0.000080 grad: 0.1034 (0.1111) loss: 0.8097 (0.8065) time: 0.2687 data: 0.0002 max mem: 26157 Train: [43] [5700/6250] eta: 0:02:31 lr: 0.000080 grad: 0.1000 (0.1111) loss: 0.8158 (0.8065) time: 0.2687 data: 0.0002 max mem: 26157 Train: [43] [5800/6250] eta: 0:02:03 lr: 0.000080 grad: 0.1083 (0.1110) loss: 0.8021 (0.8065) time: 0.2689 data: 0.0002 max mem: 26157 Train: [43] [5900/6250] eta: 0:01:36 lr: 0.000080 grad: 0.1120 (0.1110) loss: 0.8049 (0.8065) time: 0.2725 data: 0.0002 max mem: 26157 Train: [43] [6000/6250] eta: 0:01:09 lr: 0.000080 grad: 0.1060 (0.1109) loss: 0.8097 (0.8065) time: 0.2690 data: 0.0001 max mem: 26157 Train: [43] [6100/6250] eta: 0:00:41 lr: 0.000080 grad: 0.1049 (0.1109) loss: 0.7984 (0.8065) time: 0.2697 data: 0.0002 max mem: 26157 Train: [43] [6200/6250] eta: 0:00:13 lr: 0.000080 grad: 0.1037 (0.1109) loss: 0.8089 (0.8065) time: 0.2687 data: 0.0002 max mem: 26157 Train: [43] [6249/6250] eta: 0:00:00 lr: 0.000080 grad: 0.1063 (0.1109) loss: 0.8090 (0.8065) time: 0.2701 data: 0.0002 max mem: 26157 Train: [43] Total time: 0:28:56 (0.2778 s / it) Averaged stats: lr: 0.000080 grad: 0.1063 (0.1109) loss: 0.8090 (0.8065) Eval (hcp-train-subset): [43] [ 0/62] eta: 0:03:55 loss: 0.8356 (0.8356) time: 3.8025 data: 3.6804 max mem: 26157 Eval (hcp-train-subset): [43] [61/62] eta: 0:00:00 loss: 0.8210 (0.8229) time: 0.1047 data: 0.0221 max mem: 26157 Eval (hcp-train-subset): [43] Total time: 0:00:12 (0.2007 s / it) Averaged stats (hcp-train-subset): loss: 0.8210 (0.8229) Making plots (hcp-train-subset): example=29 Eval (hcp-val): [43] [ 0/62] eta: 0:04:54 loss: 0.8209 (0.8209) time: 4.7487 data: 4.6639 max mem: 26157 Eval (hcp-val): [43] [61/62] eta: 0:00:00 loss: 0.8238 (0.8264) time: 0.1347 data: 0.0503 max mem: 26157 Eval (hcp-val): [43] Total time: 0:00:13 (0.2100 s / it) Averaged stats (hcp-val): loss: 0.8238 (0.8264) Making plots (hcp-val): example=51 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [44] [ 0/6250] eta: 10:06:22 lr: 0.000080 grad: 0.2292 (0.2292) loss: 0.8479 (0.8479) time: 5.8212 data: 5.5185 max mem: 26157 Train: [44] [ 100/6250] eta: 0:34:45 lr: 0.000080 grad: 0.0970 (0.1264) loss: 0.8311 (0.8178) time: 0.2708 data: 0.0002 max mem: 26157 Train: [44] [ 200/6250] eta: 0:30:43 lr: 0.000080 grad: 0.1115 (0.1288) loss: 0.8016 (0.8096) time: 0.2689 data: 0.0002 max mem: 26157 Train: [44] [ 300/6250] eta: 0:29:07 lr: 0.000080 grad: 0.1143 (0.1259) loss: 0.7938 (0.8057) time: 0.2735 data: 0.0002 max mem: 26157 Train: [44] [ 400/6250] eta: 0:28:03 lr: 0.000080 grad: 0.1179 (0.1257) loss: 0.8002 (0.8042) time: 0.2704 data: 0.0002 max mem: 26157 Train: [44] [ 500/6250] eta: 0:27:13 lr: 0.000080 grad: 0.1070 (0.1237) loss: 0.8034 (0.8036) time: 0.2695 data: 0.0002 max mem: 26157 Train: [44] [ 600/6250] eta: 0:26:32 lr: 0.000080 grad: 0.1041 (0.1225) loss: 0.8029 (0.8036) time: 0.2695 data: 0.0002 max mem: 26157 Train: [44] [ 700/6250] eta: 0:25:53 lr: 0.000080 grad: 0.1134 (0.1214) loss: 0.8062 (0.8033) time: 0.2686 data: 0.0002 max mem: 26157 Train: [44] [ 800/6250] eta: 0:25:19 lr: 0.000080 grad: 0.1039 (0.1201) loss: 0.8002 (0.8027) time: 0.2691 data: 0.0002 max mem: 26157 Train: [44] [ 900/6250] eta: 0:24:46 lr: 0.000080 grad: 0.1150 (0.1194) loss: 0.7990 (0.8025) time: 0.2688 data: 0.0002 max mem: 26157 Train: [44] [1000/6250] eta: 0:24:14 lr: 0.000080 grad: 0.1080 (0.1185) loss: 0.7996 (0.8027) time: 0.2698 data: 0.0002 max mem: 26157 Train: [44] [1100/6250] eta: 0:23:43 lr: 0.000079 grad: 0.1086 (0.1177) loss: 0.8055 (0.8027) time: 0.2702 data: 0.0003 max mem: 26157 Train: [44] [1200/6250] eta: 0:23:13 lr: 0.000079 grad: 0.1038 (0.1171) loss: 0.8092 (0.8027) time: 0.2696 data: 0.0002 max mem: 26157 Train: [44] [1300/6250] eta: 0:23:01 lr: 0.000079 grad: 0.1024 (0.1166) loss: 0.7960 (0.8027) time: 0.2751 data: 0.0003 max mem: 26157 Train: [44] [1400/6250] eta: 0:22:32 lr: 0.000079 grad: 0.1058 (0.1164) loss: 0.8113 (0.8025) time: 0.2723 data: 0.0002 max mem: 26157 Train: [44] [1500/6250] eta: 0:22:01 lr: 0.000079 grad: 0.1123 (0.1163) loss: 0.7994 (0.8024) time: 0.2685 data: 0.0001 max mem: 26157 Train: [44] [1600/6250] eta: 0:21:31 lr: 0.000079 grad: 0.1106 (0.1163) loss: 0.8003 (0.8022) time: 0.2688 data: 0.0001 max mem: 26157 Train: [44] [1700/6250] eta: 0:21:02 lr: 0.000079 grad: 0.1168 (0.1164) loss: 0.7949 (0.8020) time: 0.2712 data: 0.0002 max mem: 26157 Train: [44] [1800/6250] eta: 0:20:45 lr: 0.000079 grad: 0.1160 (0.1164) loss: 0.7986 (0.8018) time: 0.2742 data: 0.0002 max mem: 26157 Train: [44] [1900/6250] eta: 0:20:15 lr: 0.000079 grad: 0.1105 (0.1164) loss: 0.7983 (0.8017) time: 0.2705 data: 0.0002 max mem: 26157 Train: [44] [2000/6250] eta: 0:19:45 lr: 0.000079 grad: 0.1100 (0.1162) loss: 0.8080 (0.8016) time: 0.2690 data: 0.0001 max mem: 26157 Train: [44] [2100/6250] eta: 0:19:15 lr: 0.000079 grad: 0.1213 (0.1162) loss: 0.8011 (0.8016) time: 0.2716 data: 0.0002 max mem: 26157 Train: [44] [2200/6250] eta: 0:18:46 lr: 0.000079 grad: 0.1065 (0.1161) loss: 0.8043 (0.8016) time: 0.2688 data: 0.0002 max mem: 26157 Train: [44] [2300/6250] eta: 0:18:16 lr: 0.000079 grad: 0.1022 (0.1158) loss: 0.8029 (0.8017) time: 0.2697 data: 0.0002 max mem: 26157 Train: [44] [2400/6250] eta: 0:17:47 lr: 0.000079 grad: 0.1077 (0.1157) loss: 0.8039 (0.8018) time: 0.2701 data: 0.0002 max mem: 26157 Train: [44] [2500/6250] eta: 0:17:18 lr: 0.000079 grad: 0.1067 (0.1155) loss: 0.8028 (0.8019) time: 0.2714 data: 0.0002 max mem: 26157 Train: [44] [2600/6250] eta: 0:16:52 lr: 0.000079 grad: 0.1032 (0.1152) loss: 0.7983 (0.8019) time: 0.2697 data: 0.0002 max mem: 26157 Train: [44] [2700/6250] eta: 0:16:29 lr: 0.000079 grad: 0.1091 (0.1150) loss: 0.7987 (0.8019) time: 0.2690 data: 0.0002 max mem: 26157 Train: [44] [2800/6250] eta: 0:16:01 lr: 0.000079 grad: 0.1169 (0.1153) loss: 0.8024 (0.8020) time: 0.2692 data: 0.0002 max mem: 26157 Train: [44] [2900/6250] eta: 0:15:32 lr: 0.000079 grad: 0.1101 (0.1151) loss: 0.8027 (0.8021) time: 0.2710 data: 0.0002 max mem: 26157 Train: [44] [3000/6250] eta: 0:15:03 lr: 0.000079 grad: 0.1062 (0.1151) loss: 0.8051 (0.8021) time: 0.2688 data: 0.0002 max mem: 26157 Train: [44] [3100/6250] eta: 0:14:34 lr: 0.000079 grad: 0.1102 (0.1151) loss: 0.8074 (0.8022) time: 0.2691 data: 0.0002 max mem: 26157 Train: [44] [3200/6250] eta: 0:14:06 lr: 0.000079 grad: 0.1068 (0.1151) loss: 0.8091 (0.8024) time: 0.2682 data: 0.0002 max mem: 26157 Train: [44] [3300/6250] eta: 0:13:37 lr: 0.000079 grad: 0.1088 (0.1150) loss: 0.8084 (0.8025) time: 0.2683 data: 0.0002 max mem: 26157 Train: [44] [3400/6250] eta: 0:13:09 lr: 0.000079 grad: 0.1010 (0.1150) loss: 0.8038 (0.8026) time: 0.2686 data: 0.0002 max mem: 26157 Train: [44] [3500/6250] eta: 0:12:41 lr: 0.000079 grad: 0.1148 (0.1149) loss: 0.7980 (0.8026) time: 0.2707 data: 0.0002 max mem: 26157 Train: [44] [3600/6250] eta: 0:12:12 lr: 0.000079 grad: 0.1068 (0.1149) loss: 0.8098 (0.8027) time: 0.2703 data: 0.0002 max mem: 26157 Train: [44] [3700/6250] eta: 0:11:44 lr: 0.000079 grad: 0.1149 (0.1149) loss: 0.8035 (0.8028) time: 0.2698 data: 0.0002 max mem: 26157 Train: [44] [3800/6250] eta: 0:11:16 lr: 0.000079 grad: 0.1061 (0.1148) loss: 0.8043 (0.8028) time: 0.2689 data: 0.0002 max mem: 26157 Train: [44] [3900/6250] eta: 0:10:48 lr: 0.000079 grad: 0.1076 (0.1150) loss: 0.7977 (0.8027) time: 0.2678 data: 0.0002 max mem: 26157 Train: [44] [4000/6250] eta: 0:10:20 lr: 0.000079 grad: 0.1072 (0.1149) loss: 0.8025 (0.8027) time: 0.2695 data: 0.0002 max mem: 26157 Train: [44] [4100/6250] eta: 0:09:52 lr: 0.000079 grad: 0.1114 (0.1149) loss: 0.7978 (0.8026) time: 0.2690 data: 0.0002 max mem: 26157 Train: [44] [4200/6250] eta: 0:09:24 lr: 0.000078 grad: 0.1053 (0.1148) loss: 0.8002 (0.8026) time: 0.2705 data: 0.0002 max mem: 26157 Train: [44] [4300/6250] eta: 0:08:57 lr: 0.000078 grad: 0.1124 (0.1148) loss: 0.7981 (0.8026) time: 0.2711 data: 0.0002 max mem: 26157 Train: [44] [4400/6250] eta: 0:08:29 lr: 0.000078 grad: 0.1179 (0.1148) loss: 0.7929 (0.8025) time: 0.2699 data: 0.0002 max mem: 26157 Train: [44] [4500/6250] eta: 0:08:01 lr: 0.000078 grad: 0.1100 (0.1149) loss: 0.8012 (0.8025) time: 0.2715 data: 0.0002 max mem: 26157 Train: [44] [4600/6250] eta: 0:07:33 lr: 0.000078 grad: 0.1114 (0.1148) loss: 0.8043 (0.8024) time: 0.2717 data: 0.0002 max mem: 26157 Train: [44] [4700/6250] eta: 0:07:06 lr: 0.000078 grad: 0.1025 (0.1148) loss: 0.8031 (0.8023) time: 0.2703 data: 0.0002 max mem: 26157 Train: [44] [4800/6250] eta: 0:06:38 lr: 0.000078 grad: 0.1137 (0.1148) loss: 0.7939 (0.8022) time: 0.2701 data: 0.0002 max mem: 26157 Train: [44] [4900/6250] eta: 0:06:10 lr: 0.000078 grad: 0.1016 (0.1148) loss: 0.8059 (0.8022) time: 0.2687 data: 0.0002 max mem: 26157 Train: [44] [5000/6250] eta: 0:05:43 lr: 0.000078 grad: 0.1132 (0.1147) loss: 0.8011 (0.8022) time: 0.2691 data: 0.0002 max mem: 26157 Train: [44] [5100/6250] eta: 0:05:15 lr: 0.000078 grad: 0.1151 (0.1147) loss: 0.8047 (0.8022) time: 0.2726 data: 0.0003 max mem: 26157 Train: [44] [5200/6250] eta: 0:04:48 lr: 0.000078 grad: 0.1084 (0.1147) loss: 0.8043 (0.8022) time: 0.2697 data: 0.0002 max mem: 26157 Train: [44] [5300/6250] eta: 0:04:20 lr: 0.000078 grad: 0.1160 (0.1147) loss: 0.8009 (0.8022) time: 0.2695 data: 0.0001 max mem: 26157 Train: [44] [5400/6250] eta: 0:03:53 lr: 0.000078 grad: 0.1116 (0.1148) loss: 0.7998 (0.8022) time: 0.2687 data: 0.0002 max mem: 26157 Train: [44] [5500/6250] eta: 0:03:25 lr: 0.000078 grad: 0.1115 (0.1150) loss: 0.8015 (0.8022) time: 0.2700 data: 0.0002 max mem: 26157 Train: [44] [5600/6250] eta: 0:02:58 lr: 0.000078 grad: 0.1098 (0.1152) loss: 0.8045 (0.8022) time: 0.2684 data: 0.0002 max mem: 26157 Train: [44] [5700/6250] eta: 0:02:30 lr: 0.000078 grad: 0.1118 (0.1152) loss: 0.8101 (0.8022) time: 0.2695 data: 0.0002 max mem: 26157 Train: [44] [5800/6250] eta: 0:02:03 lr: 0.000078 grad: 0.1088 (0.1153) loss: 0.8110 (0.8022) time: 0.2702 data: 0.0002 max mem: 26157 Train: [44] [5900/6250] eta: 0:01:35 lr: 0.000078 grad: 0.1121 (0.1154) loss: 0.8047 (0.8022) time: 0.2694 data: 0.0002 max mem: 26157 Train: [44] [6000/6250] eta: 0:01:08 lr: 0.000078 grad: 0.1148 (0.1155) loss: 0.7967 (0.8021) time: 0.2726 data: 0.0002 max mem: 26157 Train: [44] [6100/6250] eta: 0:00:41 lr: 0.000078 grad: 0.1246 (0.1156) loss: 0.7956 (0.8021) time: 0.2719 data: 0.0002 max mem: 26157 Train: [44] [6200/6250] eta: 0:00:13 lr: 0.000078 grad: 0.1159 (0.1157) loss: 0.7963 (0.8020) time: 0.2698 data: 0.0002 max mem: 26157 Train: [44] [6249/6250] eta: 0:00:00 lr: 0.000078 grad: 0.1121 (0.1157) loss: 0.8011 (0.8020) time: 0.2719 data: 0.0002 max mem: 26157 Train: [44] Total time: 0:28:37 (0.2748 s / it) Averaged stats: lr: 0.000078 grad: 0.1121 (0.1157) loss: 0.8011 (0.8020) Eval (hcp-train-subset): [44] [ 0/62] eta: 0:05:40 loss: 0.8297 (0.8297) time: 5.4899 data: 5.4064 max mem: 26157 Eval (hcp-train-subset): [44] [61/62] eta: 0:00:00 loss: 0.8246 (0.8264) time: 0.1250 data: 0.0423 max mem: 26157 Eval (hcp-train-subset): [44] Total time: 0:00:13 (0.2181 s / it) Averaged stats (hcp-train-subset): loss: 0.8246 (0.8264) Making plots (hcp-train-subset): example=7 Eval (hcp-val): [44] [ 0/62] eta: 0:06:02 loss: 0.8213 (0.8213) time: 5.8437 data: 5.7581 max mem: 26157 Eval (hcp-val): [44] [61/62] eta: 0:00:00 loss: 0.8259 (0.8269) time: 0.1109 data: 0.0285 max mem: 26157 Eval (hcp-val): [44] Total time: 0:00:13 (0.2151 s / it) Averaged stats (hcp-val): loss: 0.8259 (0.8269) Making plots (hcp-val): example=6 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [45] [ 0/6250] eta: 12:55:58 lr: 0.000078 grad: 0.1865 (0.1865) loss: 0.7921 (0.7921) time: 7.4493 data: 7.1706 max mem: 26157 Train: [45] [ 100/6250] eta: 0:36:49 lr: 0.000078 grad: 0.1300 (0.1671) loss: 0.8006 (0.8084) time: 0.3486 data: 0.0769 max mem: 26157 Train: [45] [ 200/6250] eta: 0:31:51 lr: 0.000078 grad: 0.1184 (0.1458) loss: 0.8007 (0.8027) time: 0.2701 data: 0.0002 max mem: 26157 Train: [45] [ 300/6250] eta: 0:29:50 lr: 0.000078 grad: 0.1129 (0.1354) loss: 0.7982 (0.8024) time: 0.2696 data: 0.0002 max mem: 26157 Train: [45] [ 400/6250] eta: 0:29:38 lr: 0.000078 grad: 0.1088 (0.1298) loss: 0.8114 (0.8035) time: 0.4759 data: 0.2028 max mem: 26157 Train: [45] [ 500/6250] eta: 0:28:28 lr: 0.000078 grad: 0.1128 (0.1261) loss: 0.8066 (0.8043) time: 0.2688 data: 0.0002 max mem: 26157 Train: [45] [ 600/6250] eta: 0:28:34 lr: 0.000078 grad: 0.1094 (0.1242) loss: 0.8076 (0.8044) time: 0.2711 data: 0.0002 max mem: 26157 Train: [45] [ 700/6250] eta: 0:28:06 lr: 0.000078 grad: 0.1021 (0.1221) loss: 0.8011 (0.8047) time: 0.2712 data: 0.0002 max mem: 26157 Train: [45] [ 800/6250] eta: 0:27:43 lr: 0.000078 grad: 0.1056 (0.1209) loss: 0.8068 (0.8047) time: 0.2731 data: 0.0002 max mem: 26157 Train: [45] [ 900/6250] eta: 0:27:15 lr: 0.000078 grad: 0.1029 (0.1194) loss: 0.8085 (0.8050) time: 0.2808 data: 0.0003 max mem: 26157 Train: [45] [1000/6250] eta: 0:26:42 lr: 0.000078 grad: 0.1062 (0.1184) loss: 0.8055 (0.8053) time: 0.2677 data: 0.0002 max mem: 26157 Train: [45] [1100/6250] eta: 0:25:55 lr: 0.000077 grad: 0.1008 (0.1174) loss: 0.8104 (0.8055) time: 0.2699 data: 0.0002 max mem: 26157 Train: [45] [1200/6250] eta: 0:25:11 lr: 0.000077 grad: 0.1175 (0.1170) loss: 0.8095 (0.8055) time: 0.2720 data: 0.0004 max mem: 26157 Train: [45] [1300/6250] eta: 0:24:30 lr: 0.000077 grad: 0.1119 (0.1171) loss: 0.8056 (0.8054) time: 0.2698 data: 0.0002 max mem: 26157 Train: [45] [1400/6250] eta: 0:23:51 lr: 0.000077 grad: 0.1087 (0.1172) loss: 0.7947 (0.8051) time: 0.2704 data: 0.0002 max mem: 26157 Train: [45] [1500/6250] eta: 0:23:13 lr: 0.000077 grad: 0.1162 (0.1171) loss: 0.8020 (0.8050) time: 0.2713 data: 0.0002 max mem: 26157 Train: [45] [1600/6250] eta: 0:22:37 lr: 0.000077 grad: 0.1118 (0.1173) loss: 0.7984 (0.8049) time: 0.2700 data: 0.0002 max mem: 26157 Train: [45] [1700/6250] eta: 0:22:02 lr: 0.000077 grad: 0.1064 (0.1171) loss: 0.8017 (0.8048) time: 0.2700 data: 0.0002 max mem: 26157 Train: [45] [1800/6250] eta: 0:21:28 lr: 0.000077 grad: 0.1089 (0.1170) loss: 0.8027 (0.8048) time: 0.2725 data: 0.0002 max mem: 26157 Train: [45] [1900/6250] eta: 0:20:55 lr: 0.000077 grad: 0.1094 (0.1168) loss: 0.8077 (0.8047) time: 0.2694 data: 0.0002 max mem: 26157 Train: [45] [2000/6250] eta: 0:20:22 lr: 0.000077 grad: 0.1163 (0.1169) loss: 0.7998 (0.8046) time: 0.2702 data: 0.0002 max mem: 26157 Train: [45] [2100/6250] eta: 0:19:50 lr: 0.000077 grad: 0.1066 (0.1168) loss: 0.7987 (0.8045) time: 0.2689 data: 0.0002 max mem: 26157 Train: [45] [2200/6250] eta: 0:19:18 lr: 0.000077 grad: 0.1131 (0.1168) loss: 0.8019 (0.8045) time: 0.2698 data: 0.0002 max mem: 26157 Train: [45] [2300/6250] eta: 0:18:46 lr: 0.000077 grad: 0.1093 (0.1170) loss: 0.8077 (0.8043) time: 0.2694 data: 0.0002 max mem: 26157 Train: [45] [2400/6250] eta: 0:18:15 lr: 0.000077 grad: 0.1074 (0.1170) loss: 0.8078 (0.8042) time: 0.2703 data: 0.0002 max mem: 26157 Train: [45] [2500/6250] eta: 0:17:44 lr: 0.000077 grad: 0.1038 (0.1173) loss: 0.8070 (0.8041) time: 0.2698 data: 0.0002 max mem: 26157 Train: [45] [2600/6250] eta: 0:17:14 lr: 0.000077 grad: 0.1064 (0.1172) loss: 0.8027 (0.8041) time: 0.2718 data: 0.0002 max mem: 26157 Train: [45] [2700/6250] eta: 0:16:49 lr: 0.000077 grad: 0.1092 (0.1173) loss: 0.8029 (0.8041) time: 0.2733 data: 0.0002 max mem: 26157 Train: [45] [2800/6250] eta: 0:16:19 lr: 0.000077 grad: 0.1084 (0.1172) loss: 0.8108 (0.8041) time: 0.2995 data: 0.0216 max mem: 26157 Train: [45] [2900/6250] eta: 0:15:49 lr: 0.000077 grad: 0.1186 (0.1172) loss: 0.7899 (0.8040) time: 0.2696 data: 0.0002 max mem: 26157 Train: [45] [3000/6250] eta: 0:15:20 lr: 0.000077 grad: 0.1167 (0.1174) loss: 0.7979 (0.8039) time: 0.2684 data: 0.0002 max mem: 26157 Train: [45] [3100/6250] eta: 0:14:55 lr: 0.000077 grad: 0.1080 (0.1172) loss: 0.8040 (0.8039) time: 0.2722 data: 0.0002 max mem: 26157 Train: [45] [3200/6250] eta: 0:14:25 lr: 0.000077 grad: 0.1061 (0.1171) loss: 0.7986 (0.8039) time: 0.2737 data: 0.0002 max mem: 26157 Train: [45] [3300/6250] eta: 0:13:55 lr: 0.000077 grad: 0.1140 (0.1174) loss: 0.8003 (0.8039) time: 0.2701 data: 0.0002 max mem: 26157 Train: [45] [3400/6250] eta: 0:13:26 lr: 0.000077 grad: 0.1078 (0.1174) loss: 0.8021 (0.8038) time: 0.2682 data: 0.0001 max mem: 26157 Train: [45] [3500/6250] eta: 0:12:56 lr: 0.000077 grad: 0.1061 (0.1172) loss: 0.8048 (0.8038) time: 0.2688 data: 0.0002 max mem: 26157 Train: [45] [3600/6250] eta: 0:12:27 lr: 0.000077 grad: 0.1110 (0.1173) loss: 0.8044 (0.8038) time: 0.2690 data: 0.0001 max mem: 26157 Train: [45] [3700/6250] eta: 0:11:58 lr: 0.000077 grad: 0.1156 (0.1173) loss: 0.8046 (0.8038) time: 0.2682 data: 0.0002 max mem: 26157 Train: [45] [3800/6250] eta: 0:11:30 lr: 0.000077 grad: 0.1156 (0.1172) loss: 0.7978 (0.8037) time: 0.2739 data: 0.0002 max mem: 26157 Train: [45] [3900/6250] eta: 0:11:01 lr: 0.000077 grad: 0.1151 (0.1172) loss: 0.7937 (0.8036) time: 0.2723 data: 0.0002 max mem: 26157 Train: [45] [4000/6250] eta: 0:10:32 lr: 0.000077 grad: 0.1127 (0.1173) loss: 0.7965 (0.8035) time: 0.2715 data: 0.0003 max mem: 26157 Train: [45] [4100/6250] eta: 0:10:04 lr: 0.000077 grad: 0.1117 (0.1173) loss: 0.8059 (0.8035) time: 0.2683 data: 0.0002 max mem: 26157 Train: [45] [4200/6250] eta: 0:09:35 lr: 0.000076 grad: 0.1109 (0.1176) loss: 0.7994 (0.8034) time: 0.2676 data: 0.0002 max mem: 26157 Train: [45] [4300/6250] eta: 0:09:06 lr: 0.000076 grad: 0.1175 (0.1175) loss: 0.7980 (0.8033) time: 0.2693 data: 0.0002 max mem: 26157 Train: [45] [4400/6250] eta: 0:08:38 lr: 0.000076 grad: 0.1109 (0.1177) loss: 0.7992 (0.8032) time: 0.2691 data: 0.0002 max mem: 26157 Train: [45] [4500/6250] eta: 0:08:09 lr: 0.000076 grad: 0.1078 (0.1178) loss: 0.8037 (0.8032) time: 0.2738 data: 0.0002 max mem: 26157 Train: [45] [4600/6250] eta: 0:07:43 lr: 0.000076 grad: 0.1198 (0.1180) loss: 0.7979 (0.8032) time: 0.2677 data: 0.0002 max mem: 26157 Train: [45] [4700/6250] eta: 0:07:14 lr: 0.000076 grad: 0.1118 (0.1181) loss: 0.8069 (0.8031) time: 0.2723 data: 0.0002 max mem: 26157 Train: [45] [4800/6250] eta: 0:06:46 lr: 0.000076 grad: 0.1241 (0.1183) loss: 0.8013 (0.8030) time: 0.2692 data: 0.0002 max mem: 26157 Train: [45] [4900/6250] eta: 0:06:18 lr: 0.000076 grad: 0.1250 (0.1185) loss: 0.8048 (0.8029) time: 0.2715 data: 0.0002 max mem: 26157 Train: [45] [5000/6250] eta: 0:05:50 lr: 0.000076 grad: 0.1136 (0.1185) loss: 0.8018 (0.8029) time: 0.2696 data: 0.0002 max mem: 26157 Train: [45] [5100/6250] eta: 0:05:22 lr: 0.000076 grad: 0.1155 (0.1184) loss: 0.8025 (0.8028) time: 0.2702 data: 0.0002 max mem: 26157 Train: [45] [5200/6250] eta: 0:04:54 lr: 0.000076 grad: 0.1163 (0.1186) loss: 0.7996 (0.8028) time: 0.2685 data: 0.0002 max mem: 26157 Train: [45] [5300/6250] eta: 0:04:26 lr: 0.000076 grad: 0.1159 (0.1187) loss: 0.8039 (0.8027) time: 0.2687 data: 0.0002 max mem: 26157 Train: [45] [5400/6250] eta: 0:03:58 lr: 0.000076 grad: 0.1072 (0.1188) loss: 0.8017 (0.8026) time: 0.2710 data: 0.0002 max mem: 26157 Train: [45] [5500/6250] eta: 0:03:30 lr: 0.000076 grad: 0.1125 (0.1190) loss: 0.8019 (0.8026) time: 0.2693 data: 0.0002 max mem: 26157 Train: [45] [5600/6250] eta: 0:03:01 lr: 0.000076 grad: 0.1138 (0.1189) loss: 0.8032 (0.8026) time: 0.2691 data: 0.0002 max mem: 26157 Train: [45] [5700/6250] eta: 0:02:33 lr: 0.000076 grad: 0.1112 (0.1189) loss: 0.8032 (0.8025) time: 0.2687 data: 0.0002 max mem: 26157 Train: [45] [5800/6250] eta: 0:02:05 lr: 0.000076 grad: 0.1156 (0.1189) loss: 0.8032 (0.8025) time: 0.2676 data: 0.0002 max mem: 26157 Train: [45] [5900/6250] eta: 0:01:37 lr: 0.000076 grad: 0.1184 (0.1189) loss: 0.8000 (0.8025) time: 0.2679 data: 0.0002 max mem: 26157 Train: [45] [6000/6250] eta: 0:01:09 lr: 0.000076 grad: 0.1125 (0.1188) loss: 0.8055 (0.8025) time: 0.2723 data: 0.0002 max mem: 26157 Train: [45] [6100/6250] eta: 0:00:41 lr: 0.000076 grad: 0.1127 (0.1192) loss: 0.8011 (0.8025) time: 0.2699 data: 0.0002 max mem: 26157 Train: [45] [6200/6250] eta: 0:00:13 lr: 0.000076 grad: 0.1185 (0.1192) loss: 0.7963 (0.8024) time: 0.2687 data: 0.0002 max mem: 26157 Train: [45] [6249/6250] eta: 0:00:00 lr: 0.000076 grad: 0.1209 (0.1193) loss: 0.7982 (0.8024) time: 0.2701 data: 0.0002 max mem: 26157 Train: [45] Total time: 0:29:11 (0.2803 s / it) Averaged stats: lr: 0.000076 grad: 0.1209 (0.1193) loss: 0.7982 (0.8024) Eval (hcp-train-subset): [45] [ 0/62] eta: 0:05:09 loss: 0.8372 (0.8372) time: 4.9903 data: 4.9063 max mem: 26157 Eval (hcp-train-subset): [45] [61/62] eta: 0:00:00 loss: 0.8221 (0.8250) time: 0.1300 data: 0.0473 max mem: 26157 Eval (hcp-train-subset): [45] Total time: 0:00:14 (0.2330 s / it) Averaged stats (hcp-train-subset): loss: 0.8221 (0.8250) Making plots (hcp-train-subset): example=8 Eval (hcp-val): [45] [ 0/62] eta: 0:04:48 loss: 0.8210 (0.8210) time: 4.6591 data: 4.5741 max mem: 26157 Eval (hcp-val): [45] [61/62] eta: 0:00:00 loss: 0.8243 (0.8258) time: 0.1366 data: 0.0532 max mem: 26157 Eval (hcp-val): [45] Total time: 0:00:13 (0.2208 s / it) Averaged stats (hcp-val): loss: 0.8243 (0.8258) Making plots (hcp-val): example=27 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [46] [ 0/6250] eta: 11:06:54 lr: 0.000076 grad: 0.3525 (0.3525) loss: 0.8067 (0.8067) time: 6.4024 data: 6.1155 max mem: 26157 Train: [46] [ 100/6250] eta: 0:34:29 lr: 0.000076 grad: 0.1238 (0.1352) loss: 0.8213 (0.8261) time: 0.2698 data: 0.0002 max mem: 26157 Train: [46] [ 200/6250] eta: 0:30:32 lr: 0.000076 grad: 0.1084 (0.1298) loss: 0.8104 (0.8171) time: 0.2693 data: 0.0002 max mem: 26157 Train: [46] [ 300/6250] eta: 0:28:55 lr: 0.000076 grad: 0.1113 (0.1253) loss: 0.8112 (0.8131) time: 0.2685 data: 0.0002 max mem: 26157 Train: [46] [ 400/6250] eta: 0:27:52 lr: 0.000076 grad: 0.1011 (0.1207) loss: 0.8111 (0.8122) time: 0.2671 data: 0.0001 max mem: 26157 Train: [46] [ 500/6250] eta: 0:27:06 lr: 0.000076 grad: 0.1088 (0.1188) loss: 0.8140 (0.8115) time: 0.2713 data: 0.0002 max mem: 26157 Train: [46] [ 600/6250] eta: 0:26:26 lr: 0.000076 grad: 0.1102 (0.1173) loss: 0.8137 (0.8110) time: 0.2695 data: 0.0002 max mem: 26157 Train: [46] [ 700/6250] eta: 0:25:52 lr: 0.000076 grad: 0.1055 (0.1167) loss: 0.8110 (0.8106) time: 0.2742 data: 0.0002 max mem: 26157 Train: [46] [ 800/6250] eta: 0:25:19 lr: 0.000076 grad: 0.1040 (0.1162) loss: 0.8160 (0.8102) time: 0.2740 data: 0.0002 max mem: 26157 Train: [46] [ 900/6250] eta: 0:24:47 lr: 0.000076 grad: 0.1067 (0.1157) loss: 0.8044 (0.8101) time: 0.2716 data: 0.0002 max mem: 26157 Train: [46] [1000/6250] eta: 0:24:16 lr: 0.000076 grad: 0.1022 (0.1152) loss: 0.8085 (0.8100) time: 0.2727 data: 0.0002 max mem: 26157 Train: [46] [1100/6250] eta: 0:23:44 lr: 0.000075 grad: 0.1091 (0.1149) loss: 0.8007 (0.8094) time: 0.2685 data: 0.0002 max mem: 26157 Train: [46] [1200/6250] eta: 0:23:14 lr: 0.000075 grad: 0.1100 (0.1149) loss: 0.8043 (0.8088) time: 0.2716 data: 0.0002 max mem: 26157 Train: [46] [1300/6250] eta: 0:22:44 lr: 0.000075 grad: 0.1085 (0.1150) loss: 0.8045 (0.8084) time: 0.2691 data: 0.0002 max mem: 26157 Train: [46] [1400/6250] eta: 0:22:30 lr: 0.000075 grad: 0.1084 (0.1151) loss: 0.8033 (0.8079) time: 0.3216 data: 0.0435 max mem: 26157 Train: [46] [1500/6250] eta: 0:22:00 lr: 0.000075 grad: 0.1087 (0.1155) loss: 0.7983 (0.8076) time: 0.2724 data: 0.0002 max mem: 26157 Train: [46] [1600/6250] eta: 0:21:30 lr: 0.000075 grad: 0.1091 (0.1156) loss: 0.8077 (0.8074) time: 0.2690 data: 0.0002 max mem: 26157 Train: [46] [1700/6250] eta: 0:21:00 lr: 0.000075 grad: 0.1062 (0.1161) loss: 0.8106 (0.8073) time: 0.2757 data: 0.0002 max mem: 26157 Train: [46] [1800/6250] eta: 0:20:31 lr: 0.000075 grad: 0.1077 (0.1162) loss: 0.8042 (0.8072) time: 0.2687 data: 0.0002 max mem: 26157 Train: [46] [1900/6250] eta: 0:20:01 lr: 0.000075 grad: 0.1116 (0.1161) loss: 0.8101 (0.8071) time: 0.2722 data: 0.0002 max mem: 26157 Train: [46] [2000/6250] eta: 0:19:41 lr: 0.000075 grad: 0.1149 (0.1160) loss: 0.8062 (0.8069) time: 0.4551 data: 0.1782 max mem: 26157 Train: [46] [2100/6250] eta: 0:19:17 lr: 0.000075 grad: 0.1159 (0.1163) loss: 0.8054 (0.8067) time: 0.2691 data: 0.0002 max mem: 26157 Train: [46] [2200/6250] eta: 0:18:48 lr: 0.000075 grad: 0.1123 (0.1164) loss: 0.8021 (0.8066) time: 0.2692 data: 0.0002 max mem: 26157 Train: [46] [2300/6250] eta: 0:18:19 lr: 0.000075 grad: 0.1082 (0.1166) loss: 0.8123 (0.8064) time: 0.2683 data: 0.0003 max mem: 26157 Train: [46] [2400/6250] eta: 0:17:49 lr: 0.000075 grad: 0.1067 (0.1170) loss: 0.8076 (0.8063) time: 0.2690 data: 0.0002 max mem: 26157 Train: [46] [2500/6250] eta: 0:17:20 lr: 0.000075 grad: 0.1048 (0.1172) loss: 0.8079 (0.8063) time: 0.2688 data: 0.0002 max mem: 26157 Train: [46] [2600/6250] eta: 0:16:51 lr: 0.000075 grad: 0.1131 (0.1171) loss: 0.8008 (0.8063) time: 0.2711 data: 0.0002 max mem: 26157 Train: [46] [2700/6250] eta: 0:16:23 lr: 0.000075 grad: 0.1151 (0.1171) loss: 0.8053 (0.8063) time: 0.2701 data: 0.0002 max mem: 26157 Train: [46] [2800/6250] eta: 0:15:54 lr: 0.000075 grad: 0.1090 (0.1171) loss: 0.8030 (0.8062) time: 0.2686 data: 0.0002 max mem: 26157 Train: [46] [2900/6250] eta: 0:15:26 lr: 0.000075 grad: 0.1062 (0.1170) loss: 0.8084 (0.8062) time: 0.2692 data: 0.0002 max mem: 26157 Train: [46] [3000/6250] eta: 0:14:57 lr: 0.000075 grad: 0.1091 (0.1171) loss: 0.8087 (0.8061) time: 0.2705 data: 0.0002 max mem: 26157 Train: [46] [3100/6250] eta: 0:14:29 lr: 0.000075 grad: 0.1028 (0.1169) loss: 0.8095 (0.8062) time: 0.2689 data: 0.0002 max mem: 26157 Train: [46] [3200/6250] eta: 0:14:01 lr: 0.000075 grad: 0.1067 (0.1168) loss: 0.8078 (0.8062) time: 0.2693 data: 0.0002 max mem: 26157 Train: [46] [3300/6250] eta: 0:13:33 lr: 0.000075 grad: 0.1053 (0.1167) loss: 0.8097 (0.8062) time: 0.2726 data: 0.0002 max mem: 26157 Train: [46] [3400/6250] eta: 0:13:05 lr: 0.000075 grad: 0.1084 (0.1166) loss: 0.8045 (0.8061) time: 0.2690 data: 0.0002 max mem: 26157 Train: [46] [3500/6250] eta: 0:12:40 lr: 0.000075 grad: 0.1146 (0.1166) loss: 0.7969 (0.8060) time: 0.2703 data: 0.0002 max mem: 26157 Train: [46] [3600/6250] eta: 0:12:12 lr: 0.000075 grad: 0.1115 (0.1165) loss: 0.8060 (0.8060) time: 0.2707 data: 0.0002 max mem: 26157 Train: [46] [3700/6250] eta: 0:11:46 lr: 0.000075 grad: 0.1075 (0.1166) loss: 0.8038 (0.8059) time: 0.2734 data: 0.0002 max mem: 26157 Train: [46] [3800/6250] eta: 0:11:18 lr: 0.000075 grad: 0.1255 (0.1168) loss: 0.8081 (0.8058) time: 0.2791 data: 0.0002 max mem: 26157 Train: [46] [3900/6250] eta: 0:10:51 lr: 0.000075 grad: 0.1104 (0.1169) loss: 0.7979 (0.8057) time: 0.2683 data: 0.0002 max mem: 26157 Train: [46] [4000/6250] eta: 0:10:23 lr: 0.000075 grad: 0.1072 (0.1168) loss: 0.8116 (0.8057) time: 0.2680 data: 0.0001 max mem: 26157 Train: [46] [4100/6250] eta: 0:09:55 lr: 0.000075 grad: 0.1124 (0.1169) loss: 0.7993 (0.8057) time: 0.2715 data: 0.0002 max mem: 26157 Train: [46] [4200/6250] eta: 0:09:27 lr: 0.000074 grad: 0.1110 (0.1169) loss: 0.8111 (0.8057) time: 0.2723 data: 0.0002 max mem: 26157 Train: [46] [4300/6250] eta: 0:08:59 lr: 0.000074 grad: 0.1171 (0.1170) loss: 0.7983 (0.8056) time: 0.2687 data: 0.0002 max mem: 26157 Train: [46] [4400/6250] eta: 0:08:31 lr: 0.000074 grad: 0.1097 (0.1170) loss: 0.8025 (0.8055) time: 0.2688 data: 0.0002 max mem: 26157 Train: [46] [4500/6250] eta: 0:08:06 lr: 0.000074 grad: 0.1244 (0.1172) loss: 0.8015 (0.8054) time: 0.7281 data: 0.4553 max mem: 26157 Train: [46] [4600/6250] eta: 0:07:38 lr: 0.000074 grad: 0.1210 (0.1173) loss: 0.7976 (0.8053) time: 0.2714 data: 0.0002 max mem: 26157 Train: [46] [4700/6250] eta: 0:07:11 lr: 0.000074 grad: 0.1178 (0.1175) loss: 0.8002 (0.8052) time: 0.2708 data: 0.0002 max mem: 26157 Train: [46] [4800/6250] eta: 0:06:44 lr: 0.000074 grad: 0.1133 (0.1176) loss: 0.8041 (0.8051) time: 0.3050 data: 0.0337 max mem: 26157 Train: [46] [4900/6250] eta: 0:06:16 lr: 0.000074 grad: 0.1158 (0.1179) loss: 0.8005 (0.8049) time: 0.2695 data: 0.0001 max mem: 26157 Train: [46] [5000/6250] eta: 0:05:48 lr: 0.000074 grad: 0.1148 (0.1179) loss: 0.7949 (0.8048) time: 0.2687 data: 0.0002 max mem: 26157 Train: [46] [5100/6250] eta: 0:05:19 lr: 0.000074 grad: 0.1184 (0.1181) loss: 0.8027 (0.8047) time: 0.2687 data: 0.0002 max mem: 26157 Train: [46] [5200/6250] eta: 0:04:51 lr: 0.000074 grad: 0.1201 (0.1182) loss: 0.7907 (0.8045) time: 0.2686 data: 0.0002 max mem: 26157 Train: [46] [5300/6250] eta: 0:04:24 lr: 0.000074 grad: 0.1110 (0.1184) loss: 0.8023 (0.8044) time: 0.2722 data: 0.0002 max mem: 26157 Train: [46] [5400/6250] eta: 0:03:57 lr: 0.000074 grad: 0.1176 (0.1185) loss: 0.7986 (0.8043) time: 0.2714 data: 0.0002 max mem: 26157 Train: [46] [5500/6250] eta: 0:03:29 lr: 0.000074 grad: 0.1189 (0.1187) loss: 0.8014 (0.8041) time: 0.2680 data: 0.0002 max mem: 26157 Train: [46] [5600/6250] eta: 0:03:01 lr: 0.000074 grad: 0.1219 (0.1189) loss: 0.7963 (0.8040) time: 0.2695 data: 0.0002 max mem: 26157 Train: [46] [5700/6250] eta: 0:02:33 lr: 0.000074 grad: 0.1223 (0.1190) loss: 0.7947 (0.8039) time: 0.2689 data: 0.0002 max mem: 26157 Train: [46] [5800/6250] eta: 0:02:05 lr: 0.000074 grad: 0.1151 (0.1191) loss: 0.8033 (0.8038) time: 0.2693 data: 0.0002 max mem: 26157 Train: [46] [5900/6250] eta: 0:01:37 lr: 0.000074 grad: 0.1030 (0.1191) loss: 0.8054 (0.8038) time: 0.2679 data: 0.0002 max mem: 26157 Train: [46] [6000/6250] eta: 0:01:09 lr: 0.000074 grad: 0.1134 (0.1191) loss: 0.7998 (0.8037) time: 0.2699 data: 0.0002 max mem: 26157 Train: [46] [6100/6250] eta: 0:00:41 lr: 0.000074 grad: 0.1193 (0.1191) loss: 0.8012 (0.8037) time: 0.2717 data: 0.0002 max mem: 26157 Train: [46] [6200/6250] eta: 0:00:13 lr: 0.000074 grad: 0.1143 (0.1192) loss: 0.7982 (0.8037) time: 0.2697 data: 0.0002 max mem: 26157 Train: [46] [6249/6250] eta: 0:00:00 lr: 0.000074 grad: 0.1150 (0.1192) loss: 0.8001 (0.8037) time: 0.2704 data: 0.0002 max mem: 26157 Train: [46] Total time: 0:29:12 (0.2804 s / it) Averaged stats: lr: 0.000074 grad: 0.1150 (0.1192) loss: 0.8001 (0.8037) Eval (hcp-train-subset): [46] [ 0/62] eta: 0:04:50 loss: 0.8340 (0.8340) time: 4.6825 data: 4.5975 max mem: 26157 Eval (hcp-train-subset): [46] [61/62] eta: 0:00:00 loss: 0.8229 (0.8251) time: 0.1299 data: 0.0467 max mem: 26157 Eval (hcp-train-subset): [46] Total time: 0:00:13 (0.2115 s / it) Averaged stats (hcp-train-subset): loss: 0.8229 (0.8251) Making plots (hcp-train-subset): example=17 Eval (hcp-val): [46] [ 0/62] eta: 0:05:53 loss: 0.8203 (0.8203) time: 5.7096 data: 5.6253 max mem: 26157 Eval (hcp-val): [46] [61/62] eta: 0:00:00 loss: 0.8251 (0.8257) time: 0.1276 data: 0.0451 max mem: 26157 Eval (hcp-val): [46] Total time: 0:00:13 (0.2172 s / it) Averaged stats (hcp-val): loss: 0.8251 (0.8257) Making plots (hcp-val): example=22 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [47] [ 0/6250] eta: 11:48:05 lr: 0.000074 grad: 0.1680 (0.1680) loss: 0.8491 (0.8491) time: 6.7977 data: 6.5155 max mem: 26157 Train: [47] [ 100/6250] eta: 0:34:33 lr: 0.000074 grad: 0.1070 (0.1464) loss: 0.8114 (0.8145) time: 0.2688 data: 0.0002 max mem: 26157 Train: [47] [ 200/6250] eta: 0:30:35 lr: 0.000074 grad: 0.1067 (0.1335) loss: 0.8087 (0.8105) time: 0.2687 data: 0.0002 max mem: 26157 Train: [47] [ 300/6250] eta: 0:28:58 lr: 0.000074 grad: 0.1160 (0.1294) loss: 0.7979 (0.8082) time: 0.2699 data: 0.0002 max mem: 26157 Train: [47] [ 400/6250] eta: 0:27:56 lr: 0.000074 grad: 0.1159 (0.1281) loss: 0.8007 (0.8066) time: 0.2691 data: 0.0002 max mem: 26157 Train: [47] [ 500/6250] eta: 0:27:08 lr: 0.000074 grad: 0.1116 (0.1262) loss: 0.8000 (0.8061) time: 0.2691 data: 0.0002 max mem: 26157 Train: [47] [ 600/6250] eta: 0:26:27 lr: 0.000074 grad: 0.1107 (0.1254) loss: 0.8059 (0.8057) time: 0.2680 data: 0.0002 max mem: 26157 Train: [47] [ 700/6250] eta: 0:25:51 lr: 0.000074 grad: 0.1051 (0.1239) loss: 0.7996 (0.8054) time: 0.2708 data: 0.0002 max mem: 26157 Train: [47] [ 800/6250] eta: 0:25:19 lr: 0.000074 grad: 0.1071 (0.1222) loss: 0.8064 (0.8053) time: 0.2714 data: 0.0002 max mem: 26157 Train: [47] [ 900/6250] eta: 0:24:54 lr: 0.000074 grad: 0.1110 (0.1222) loss: 0.8060 (0.8053) time: 0.2706 data: 0.0002 max mem: 26157 Train: [47] [1000/6250] eta: 0:24:36 lr: 0.000073 grad: 0.1116 (0.1211) loss: 0.8008 (0.8054) time: 0.2711 data: 0.0002 max mem: 26157 Train: [47] [1100/6250] eta: 0:24:07 lr: 0.000073 grad: 0.1104 (0.1207) loss: 0.8005 (0.8053) time: 0.2707 data: 0.0001 max mem: 26157 Train: [47] [1200/6250] eta: 0:23:34 lr: 0.000073 grad: 0.1133 (0.1204) loss: 0.8034 (0.8051) time: 0.2676 data: 0.0002 max mem: 26157 Train: [47] [1300/6250] eta: 0:23:06 lr: 0.000073 grad: 0.1196 (0.1204) loss: 0.8010 (0.8050) time: 0.2703 data: 0.0002 max mem: 26157 Train: [47] [1400/6250] eta: 0:22:35 lr: 0.000073 grad: 0.1043 (0.1201) loss: 0.8058 (0.8049) time: 0.2683 data: 0.0001 max mem: 26157 Train: [47] [1500/6250] eta: 0:22:03 lr: 0.000073 grad: 0.1151 (0.1201) loss: 0.8017 (0.8048) time: 0.2691 data: 0.0002 max mem: 26157 Train: [47] [1600/6250] eta: 0:21:33 lr: 0.000073 grad: 0.1184 (0.1196) loss: 0.7980 (0.8047) time: 0.2714 data: 0.0002 max mem: 26157 Train: [47] [1700/6250] eta: 0:21:03 lr: 0.000073 grad: 0.1147 (0.1196) loss: 0.7999 (0.8047) time: 0.2728 data: 0.0002 max mem: 26157 Train: [47] [1800/6250] eta: 0:20:34 lr: 0.000073 grad: 0.1098 (0.1198) loss: 0.8053 (0.8047) time: 0.2707 data: 0.0002 max mem: 26157 Train: [47] [1900/6250] eta: 0:20:04 lr: 0.000073 grad: 0.1140 (0.1198) loss: 0.7985 (0.8045) time: 0.2712 data: 0.0002 max mem: 26157 Train: [47] [2000/6250] eta: 0:19:35 lr: 0.000073 grad: 0.1140 (0.1201) loss: 0.8022 (0.8044) time: 0.2690 data: 0.0002 max mem: 26157 Train: [47] [2100/6250] eta: 0:19:06 lr: 0.000073 grad: 0.1170 (0.1202) loss: 0.7980 (0.8043) time: 0.2691 data: 0.0002 max mem: 26157 Train: [47] [2200/6250] eta: 0:18:37 lr: 0.000073 grad: 0.1069 (0.1199) loss: 0.8031 (0.8041) time: 0.2699 data: 0.0002 max mem: 26157 Train: [47] [2300/6250] eta: 0:18:08 lr: 0.000073 grad: 0.1072 (0.1197) loss: 0.7999 (0.8040) time: 0.2710 data: 0.0002 max mem: 26157 Train: [47] [2400/6250] eta: 0:17:40 lr: 0.000073 grad: 0.1153 (0.1195) loss: 0.8049 (0.8040) time: 0.2690 data: 0.0002 max mem: 26157 Train: [47] [2500/6250] eta: 0:17:11 lr: 0.000073 grad: 0.1131 (0.1197) loss: 0.8022 (0.8039) time: 0.2719 data: 0.0002 max mem: 26157 Train: [47] [2600/6250] eta: 0:16:43 lr: 0.000073 grad: 0.1052 (0.1196) loss: 0.8098 (0.8039) time: 0.2717 data: 0.0002 max mem: 26157 Train: [47] [2700/6250] eta: 0:16:23 lr: 0.000073 grad: 0.1119 (0.1196) loss: 0.8082 (0.8040) time: 0.2692 data: 0.0002 max mem: 26157 Train: [47] [2800/6250] eta: 0:15:55 lr: 0.000073 grad: 0.1119 (0.1198) loss: 0.8053 (0.8040) time: 0.2707 data: 0.0002 max mem: 26157 Train: [47] [2900/6250] eta: 0:15:36 lr: 0.000073 grad: 0.1169 (0.1199) loss: 0.8032 (0.8040) time: 0.2692 data: 0.0002 max mem: 26157 Train: [47] [3000/6250] eta: 0:15:07 lr: 0.000073 grad: 0.1168 (0.1200) loss: 0.7962 (0.8039) time: 0.2693 data: 0.0002 max mem: 26157 Train: [47] [3100/6250] eta: 0:14:40 lr: 0.000073 grad: 0.1143 (0.1200) loss: 0.7980 (0.8039) time: 0.2697 data: 0.0002 max mem: 26157 Train: [47] [3200/6250] eta: 0:14:12 lr: 0.000073 grad: 0.1187 (0.1202) loss: 0.7979 (0.8038) time: 0.2719 data: 0.0002 max mem: 26157 Train: [47] [3300/6250] eta: 0:13:43 lr: 0.000073 grad: 0.1179 (0.1203) loss: 0.7942 (0.8037) time: 0.2692 data: 0.0002 max mem: 26157 Train: [47] [3400/6250] eta: 0:13:15 lr: 0.000073 grad: 0.1165 (0.1204) loss: 0.8002 (0.8035) time: 0.2706 data: 0.0002 max mem: 26157 Train: [47] [3500/6250] eta: 0:12:46 lr: 0.000073 grad: 0.1131 (0.1206) loss: 0.8016 (0.8034) time: 0.2721 data: 0.0002 max mem: 26157 Train: [47] [3600/6250] eta: 0:12:18 lr: 0.000073 grad: 0.1206 (0.1208) loss: 0.7981 (0.8033) time: 0.2687 data: 0.0002 max mem: 26157 Train: [47] [3700/6250] eta: 0:11:49 lr: 0.000073 grad: 0.1222 (0.1211) loss: 0.8026 (0.8032) time: 0.2688 data: 0.0002 max mem: 26157 Train: [47] [3800/6250] eta: 0:11:21 lr: 0.000073 grad: 0.1192 (0.1214) loss: 0.7998 (0.8030) time: 0.2699 data: 0.0002 max mem: 26157 Train: [47] [3900/6250] eta: 0:10:52 lr: 0.000073 grad: 0.1218 (0.1217) loss: 0.7996 (0.8029) time: 0.2714 data: 0.0002 max mem: 26157 Train: [47] [4000/6250] eta: 0:10:24 lr: 0.000073 grad: 0.1167 (0.1218) loss: 0.7941 (0.8028) time: 0.2692 data: 0.0002 max mem: 26157 Train: [47] [4100/6250] eta: 0:09:56 lr: 0.000072 grad: 0.1296 (0.1221) loss: 0.7934 (0.8026) time: 0.2718 data: 0.0002 max mem: 26157 Train: [47] [4200/6250] eta: 0:09:28 lr: 0.000072 grad: 0.1144 (0.1222) loss: 0.7925 (0.8025) time: 0.2712 data: 0.0002 max mem: 26157 Train: [47] [4300/6250] eta: 0:09:00 lr: 0.000072 grad: 0.1268 (0.1223) loss: 0.7948 (0.8023) time: 0.2692 data: 0.0002 max mem: 26157 Train: [47] [4400/6250] eta: 0:08:32 lr: 0.000072 grad: 0.1210 (0.1225) loss: 0.7977 (0.8022) time: 0.2722 data: 0.0002 max mem: 26157 Train: [47] [4500/6250] eta: 0:08:04 lr: 0.000072 grad: 0.1346 (0.1228) loss: 0.7913 (0.8021) time: 0.2723 data: 0.0002 max mem: 26157 Train: [47] [4600/6250] eta: 0:07:36 lr: 0.000072 grad: 0.1175 (0.1230) loss: 0.8010 (0.8020) time: 0.2694 data: 0.0002 max mem: 26157 Train: [47] [4700/6250] eta: 0:07:08 lr: 0.000072 grad: 0.1213 (0.1231) loss: 0.8043 (0.8019) time: 0.2706 data: 0.0002 max mem: 26157 Train: [47] [4800/6250] eta: 0:06:40 lr: 0.000072 grad: 0.1200 (0.1231) loss: 0.8036 (0.8019) time: 0.2685 data: 0.0002 max mem: 26157 Train: [47] [4900/6250] eta: 0:06:12 lr: 0.000072 grad: 0.1151 (0.1232) loss: 0.8000 (0.8019) time: 0.2706 data: 0.0002 max mem: 26157 Train: [47] [5000/6250] eta: 0:05:45 lr: 0.000072 grad: 0.1141 (0.1232) loss: 0.8023 (0.8019) time: 0.2677 data: 0.0001 max mem: 26157 Train: [47] [5100/6250] eta: 0:05:17 lr: 0.000072 grad: 0.1281 (0.1233) loss: 0.8007 (0.8019) time: 0.2691 data: 0.0002 max mem: 26157 Train: [47] [5200/6250] eta: 0:04:49 lr: 0.000072 grad: 0.1175 (0.1233) loss: 0.8062 (0.8019) time: 0.2722 data: 0.0002 max mem: 26157 Train: [47] [5300/6250] eta: 0:04:22 lr: 0.000072 grad: 0.1233 (0.1233) loss: 0.8011 (0.8019) time: 0.2724 data: 0.0002 max mem: 26157 Train: [47] [5400/6250] eta: 0:03:54 lr: 0.000072 grad: 0.1260 (0.1234) loss: 0.8084 (0.8019) time: 0.2688 data: 0.0002 max mem: 26157 Train: [47] [5500/6250] eta: 0:03:26 lr: 0.000072 grad: 0.1146 (0.1235) loss: 0.8068 (0.8019) time: 0.2691 data: 0.0002 max mem: 26157 Train: [47] [5600/6250] eta: 0:02:59 lr: 0.000072 grad: 0.1218 (0.1236) loss: 0.8038 (0.8019) time: 0.2681 data: 0.0002 max mem: 26157 Train: [47] [5700/6250] eta: 0:02:31 lr: 0.000072 grad: 0.1421 (0.1238) loss: 0.8003 (0.8019) time: 0.2706 data: 0.0001 max mem: 26157 Train: [47] [5800/6250] eta: 0:02:03 lr: 0.000072 grad: 0.1229 (0.1240) loss: 0.8001 (0.8020) time: 0.2684 data: 0.0002 max mem: 26157 Train: [47] [5900/6250] eta: 0:01:36 lr: 0.000072 grad: 0.1288 (0.1243) loss: 0.8003 (0.8019) time: 0.2693 data: 0.0002 max mem: 26157 Train: [47] [6000/6250] eta: 0:01:08 lr: 0.000072 grad: 0.1288 (0.1244) loss: 0.8041 (0.8020) time: 0.2709 data: 0.0002 max mem: 26157 Train: [47] [6100/6250] eta: 0:00:41 lr: 0.000072 grad: 0.1198 (0.1245) loss: 0.7982 (0.8020) time: 0.2727 data: 0.0002 max mem: 26157 Train: [47] [6200/6250] eta: 0:00:13 lr: 0.000072 grad: 0.1109 (0.1247) loss: 0.8023 (0.8020) time: 0.2683 data: 0.0001 max mem: 26157 Train: [47] [6249/6250] eta: 0:00:00 lr: 0.000072 grad: 0.1141 (0.1247) loss: 0.7994 (0.8020) time: 0.2681 data: 0.0001 max mem: 26157 Train: [47] Total time: 0:28:44 (0.2759 s / it) Averaged stats: lr: 0.000072 grad: 0.1141 (0.1247) loss: 0.7994 (0.8020) Eval (hcp-train-subset): [47] [ 0/62] eta: 0:04:07 loss: 0.8321 (0.8321) time: 3.9885 data: 3.8596 max mem: 26157 Eval (hcp-train-subset): [47] [61/62] eta: 0:00:00 loss: 0.8242 (0.8252) time: 0.1483 data: 0.0639 max mem: 26157 Eval (hcp-train-subset): [47] Total time: 0:00:13 (0.2213 s / it) Averaged stats (hcp-train-subset): loss: 0.8242 (0.8252) Making plots (hcp-train-subset): example=49 Eval (hcp-val): [47] [ 0/62] eta: 0:03:56 loss: 0.8231 (0.8231) time: 3.8209 data: 3.7141 max mem: 26157 Eval (hcp-val): [47] [61/62] eta: 0:00:00 loss: 0.8249 (0.8257) time: 0.1304 data: 0.0475 max mem: 26157 Eval (hcp-val): [47] Total time: 0:00:13 (0.2121 s / it) Averaged stats (hcp-val): loss: 0.8249 (0.8257) Making plots (hcp-val): example=28 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [48] [ 0/6250] eta: 11:24:22 lr: 0.000072 grad: 0.3552 (0.3552) loss: 0.8059 (0.8059) time: 6.5699 data: 6.2852 max mem: 26157 Train: [48] [ 100/6250] eta: 0:34:08 lr: 0.000072 grad: 0.1140 (0.1467) loss: 0.8172 (0.8152) time: 0.2718 data: 0.0002 max mem: 26157 Train: [48] [ 200/6250] eta: 0:30:22 lr: 0.000072 grad: 0.1234 (0.1360) loss: 0.7981 (0.8104) time: 0.2694 data: 0.0002 max mem: 26157 Train: [48] [ 300/6250] eta: 0:28:53 lr: 0.000072 grad: 0.1242 (0.1334) loss: 0.7985 (0.8075) time: 0.2691 data: 0.0002 max mem: 26157 Train: [48] [ 400/6250] eta: 0:27:52 lr: 0.000072 grad: 0.1189 (0.1324) loss: 0.7937 (0.8048) time: 0.2689 data: 0.0002 max mem: 26157 Train: [48] [ 500/6250] eta: 0:27:04 lr: 0.000072 grad: 0.1126 (0.1318) loss: 0.8052 (0.8039) time: 0.2717 data: 0.0002 max mem: 26157 Train: [48] [ 600/6250] eta: 0:26:25 lr: 0.000072 grad: 0.1083 (0.1317) loss: 0.7898 (0.8026) time: 0.2722 data: 0.0002 max mem: 26157 Train: [48] [ 700/6250] eta: 0:25:50 lr: 0.000072 grad: 0.1104 (0.1304) loss: 0.8038 (0.8022) time: 0.2730 data: 0.0002 max mem: 26157 Train: [48] [ 800/6250] eta: 0:25:16 lr: 0.000072 grad: 0.1072 (0.1292) loss: 0.7950 (0.8018) time: 0.2712 data: 0.0002 max mem: 26157 Train: [48] [ 900/6250] eta: 0:25:38 lr: 0.000071 grad: 0.1173 (0.1277) loss: 0.8038 (0.8015) time: 0.2714 data: 0.0002 max mem: 26157 Train: [48] [1000/6250] eta: 0:25:12 lr: 0.000071 grad: 0.1251 (0.1281) loss: 0.8026 (0.8012) time: 0.2696 data: 0.0002 max mem: 26157 Train: [48] [1100/6250] eta: 0:24:35 lr: 0.000071 grad: 0.1160 (0.1283) loss: 0.8017 (0.8011) time: 0.2687 data: 0.0002 max mem: 26157 Train: [48] [1200/6250] eta: 0:24:00 lr: 0.000071 grad: 0.1133 (0.1278) loss: 0.7895 (0.8008) time: 0.2677 data: 0.0002 max mem: 26157 Train: [48] [1300/6250] eta: 0:23:25 lr: 0.000071 grad: 0.1128 (0.1272) loss: 0.7999 (0.8006) time: 0.2679 data: 0.0002 max mem: 26157 Train: [48] [1400/6250] eta: 0:22:52 lr: 0.000071 grad: 0.1117 (0.1268) loss: 0.7951 (0.8004) time: 0.2690 data: 0.0002 max mem: 26157 Train: [48] [1500/6250] eta: 0:22:19 lr: 0.000071 grad: 0.1183 (0.1266) loss: 0.8026 (0.8004) time: 0.2692 data: 0.0002 max mem: 26157 Train: [48] [1600/6250] eta: 0:21:48 lr: 0.000071 grad: 0.1139 (0.1262) loss: 0.7983 (0.8003) time: 0.2676 data: 0.0002 max mem: 26157 Train: [48] [1700/6250] eta: 0:21:16 lr: 0.000071 grad: 0.1217 (0.1263) loss: 0.8035 (0.8002) time: 0.2688 data: 0.0002 max mem: 26157 Train: [48] [1800/6250] eta: 0:20:45 lr: 0.000071 grad: 0.1177 (0.1267) loss: 0.7926 (0.8002) time: 0.2693 data: 0.0002 max mem: 26157 Train: [48] [1900/6250] eta: 0:20:15 lr: 0.000071 grad: 0.1226 (0.1264) loss: 0.8071 (0.8002) time: 0.2689 data: 0.0002 max mem: 26157 Train: [48] [2000/6250] eta: 0:19:45 lr: 0.000071 grad: 0.1191 (0.1264) loss: 0.8015 (0.8001) time: 0.2711 data: 0.0002 max mem: 26157 Train: [48] [2100/6250] eta: 0:19:16 lr: 0.000071 grad: 0.1270 (0.1265) loss: 0.8013 (0.8001) time: 0.2698 data: 0.0002 max mem: 26157 Train: [48] [2200/6250] eta: 0:18:46 lr: 0.000071 grad: 0.1199 (0.1264) loss: 0.7970 (0.8001) time: 0.2711 data: 0.0002 max mem: 26157 Train: [48] [2300/6250] eta: 0:18:17 lr: 0.000071 grad: 0.1146 (0.1264) loss: 0.7995 (0.8000) time: 0.2683 data: 0.0002 max mem: 26157 Train: [48] [2400/6250] eta: 0:17:50 lr: 0.000071 grad: 0.1196 (0.1265) loss: 0.8017 (0.7999) time: 0.2688 data: 0.0002 max mem: 26157 Train: [48] [2500/6250] eta: 0:17:22 lr: 0.000071 grad: 0.1311 (0.1267) loss: 0.8026 (0.7998) time: 0.2741 data: 0.0002 max mem: 26157 Train: [48] [2600/6250] eta: 0:16:53 lr: 0.000071 grad: 0.1224 (0.1268) loss: 0.7961 (0.7997) time: 0.2710 data: 0.0002 max mem: 26157 Train: [48] [2700/6250] eta: 0:16:25 lr: 0.000071 grad: 0.1197 (0.1268) loss: 0.8063 (0.7997) time: 0.2680 data: 0.0001 max mem: 26157 Train: [48] [2800/6250] eta: 0:15:56 lr: 0.000071 grad: 0.1177 (0.1269) loss: 0.8037 (0.7998) time: 0.2685 data: 0.0002 max mem: 26157 Train: [48] [2900/6250] eta: 0:15:28 lr: 0.000071 grad: 0.1184 (0.1269) loss: 0.8018 (0.7997) time: 0.2683 data: 0.0001 max mem: 26157 Train: [48] [3000/6250] eta: 0:14:59 lr: 0.000071 grad: 0.1203 (0.1267) loss: 0.8047 (0.7999) time: 0.2680 data: 0.0002 max mem: 26157 Train: [48] [3100/6250] eta: 0:14:31 lr: 0.000071 grad: 0.1188 (0.1267) loss: 0.8018 (0.7999) time: 0.2684 data: 0.0002 max mem: 26157 Train: [48] [3200/6250] eta: 0:14:02 lr: 0.000071 grad: 0.1170 (0.1268) loss: 0.8039 (0.8000) time: 0.2696 data: 0.0002 max mem: 26157 Train: [48] [3300/6250] eta: 0:13:34 lr: 0.000071 grad: 0.1227 (0.1268) loss: 0.7974 (0.8000) time: 0.2686 data: 0.0002 max mem: 26157 Train: [48] [3400/6250] eta: 0:13:06 lr: 0.000071 grad: 0.1175 (0.1267) loss: 0.8033 (0.8000) time: 0.2689 data: 0.0002 max mem: 26157 Train: [48] [3500/6250] eta: 0:12:38 lr: 0.000071 grad: 0.1247 (0.1265) loss: 0.8045 (0.8000) time: 0.2688 data: 0.0002 max mem: 26157 Train: [48] [3600/6250] eta: 0:12:10 lr: 0.000071 grad: 0.1130 (0.1265) loss: 0.8000 (0.8000) time: 0.2677 data: 0.0001 max mem: 26157 Train: [48] [3700/6250] eta: 0:11:42 lr: 0.000071 grad: 0.1176 (0.1264) loss: 0.7928 (0.7999) time: 0.2677 data: 0.0002 max mem: 26157 Train: [48] [3800/6250] eta: 0:11:14 lr: 0.000071 grad: 0.1204 (0.1264) loss: 0.7982 (0.7999) time: 0.2693 data: 0.0002 max mem: 26157 Train: [48] [3900/6250] eta: 0:10:46 lr: 0.000070 grad: 0.1349 (0.1264) loss: 0.7979 (0.7998) time: 0.2690 data: 0.0002 max mem: 26157 Train: [48] [4000/6250] eta: 0:10:18 lr: 0.000070 grad: 0.1147 (0.1264) loss: 0.7987 (0.7998) time: 0.2726 data: 0.0002 max mem: 26157 Train: [48] [4100/6250] eta: 0:09:50 lr: 0.000070 grad: 0.1302 (0.1264) loss: 0.7876 (0.7997) time: 0.2682 data: 0.0002 max mem: 26157 Train: [48] [4200/6250] eta: 0:09:22 lr: 0.000070 grad: 0.1154 (0.1264) loss: 0.8053 (0.7997) time: 0.2694 data: 0.0001 max mem: 26157 Train: [48] [4300/6250] eta: 0:08:55 lr: 0.000070 grad: 0.1153 (0.1263) loss: 0.7943 (0.7998) time: 0.2700 data: 0.0002 max mem: 26157 Train: [48] [4400/6250] eta: 0:08:30 lr: 0.000070 grad: 0.1186 (0.1262) loss: 0.8045 (0.7998) time: 0.6181 data: 0.3437 max mem: 26157 Train: [48] [4500/6250] eta: 0:08:02 lr: 0.000070 grad: 0.1235 (0.1262) loss: 0.7974 (0.7998) time: 0.2727 data: 0.0002 max mem: 26157 Train: [48] [4600/6250] eta: 0:07:34 lr: 0.000070 grad: 0.1173 (0.1261) loss: 0.8003 (0.7998) time: 0.2685 data: 0.0002 max mem: 26157 Train: [48] [4700/6250] eta: 0:07:07 lr: 0.000070 grad: 0.1165 (0.1260) loss: 0.7981 (0.7999) time: 0.2683 data: 0.0001 max mem: 26157 Train: [48] [4800/6250] eta: 0:06:39 lr: 0.000070 grad: 0.1177 (0.1260) loss: 0.8041 (0.7999) time: 0.2686 data: 0.0001 max mem: 26157 Train: [48] [4900/6250] eta: 0:06:11 lr: 0.000070 grad: 0.1121 (0.1259) loss: 0.7977 (0.7999) time: 0.2693 data: 0.0004 max mem: 26157 Train: [48] [5000/6250] eta: 0:05:44 lr: 0.000070 grad: 0.1316 (0.1259) loss: 0.7959 (0.7999) time: 0.2694 data: 0.0002 max mem: 26157 Train: [48] [5100/6250] eta: 0:05:16 lr: 0.000070 grad: 0.1171 (0.1258) loss: 0.8040 (0.7999) time: 0.2697 data: 0.0002 max mem: 26157 Train: [48] [5200/6250] eta: 0:04:48 lr: 0.000070 grad: 0.1170 (0.1257) loss: 0.7946 (0.7999) time: 0.2678 data: 0.0002 max mem: 26157 Train: [48] [5300/6250] eta: 0:04:21 lr: 0.000070 grad: 0.1211 (0.1257) loss: 0.8005 (0.8000) time: 0.2714 data: 0.0002 max mem: 26157 Train: [48] [5400/6250] eta: 0:03:53 lr: 0.000070 grad: 0.1177 (0.1257) loss: 0.8046 (0.8000) time: 0.2704 data: 0.0001 max mem: 26157 Train: [48] [5500/6250] eta: 0:03:26 lr: 0.000070 grad: 0.1198 (0.1257) loss: 0.7986 (0.8000) time: 0.2695 data: 0.0001 max mem: 26157 Train: [48] [5600/6250] eta: 0:02:58 lr: 0.000070 grad: 0.1207 (0.1257) loss: 0.7991 (0.8000) time: 0.2691 data: 0.0002 max mem: 26157 Train: [48] [5700/6250] eta: 0:02:31 lr: 0.000070 grad: 0.1182 (0.1258) loss: 0.7942 (0.8000) time: 0.2691 data: 0.0002 max mem: 26157 Train: [48] [5800/6250] eta: 0:02:03 lr: 0.000070 grad: 0.1233 (0.1258) loss: 0.8021 (0.8000) time: 0.2709 data: 0.0002 max mem: 26157 Train: [48] [5900/6250] eta: 0:01:36 lr: 0.000070 grad: 0.1131 (0.1258) loss: 0.8072 (0.8001) time: 0.2700 data: 0.0002 max mem: 26157 Train: [48] [6000/6250] eta: 0:01:08 lr: 0.000070 grad: 0.1183 (0.1259) loss: 0.8037 (0.8002) time: 0.2687 data: 0.0002 max mem: 26157 Train: [48] [6100/6250] eta: 0:00:41 lr: 0.000070 grad: 0.1219 (0.1258) loss: 0.8028 (0.8002) time: 0.2692 data: 0.0002 max mem: 26157 Train: [48] [6200/6250] eta: 0:00:13 lr: 0.000070 grad: 0.1128 (0.1258) loss: 0.8050 (0.8003) time: 0.2711 data: 0.0002 max mem: 26157 Train: [48] [6249/6250] eta: 0:00:00 lr: 0.000070 grad: 0.1211 (0.1258) loss: 0.8038 (0.8003) time: 0.2714 data: 0.0002 max mem: 26157 Train: [48] Total time: 0:28:40 (0.2754 s / it) Averaged stats: lr: 0.000070 grad: 0.1211 (0.1258) loss: 0.8038 (0.8003) Eval (hcp-train-subset): [48] [ 0/62] eta: 0:05:43 loss: 0.8359 (0.8359) time: 5.5469 data: 5.4625 max mem: 26157 Eval (hcp-train-subset): [48] [61/62] eta: 0:00:00 loss: 0.8239 (0.8236) time: 0.1148 data: 0.0320 max mem: 26157 Eval (hcp-train-subset): [48] Total time: 0:00:13 (0.2119 s / it) Averaged stats (hcp-train-subset): loss: 0.8239 (0.8236) Making plots (hcp-train-subset): example=23 Eval (hcp-val): [48] [ 0/62] eta: 0:04:14 loss: 0.8208 (0.8208) time: 4.0992 data: 3.9946 max mem: 26157 Eval (hcp-val): [48] [61/62] eta: 0:00:00 loss: 0.8251 (0.8260) time: 0.1293 data: 0.0448 max mem: 26157 Eval (hcp-val): [48] Total time: 0:00:13 (0.2195 s / it) Averaged stats (hcp-val): loss: 0.8251 (0.8260) Making plots (hcp-val): example=50 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [49] [ 0/6250] eta: 8:18:45 lr: 0.000070 grad: 0.1242 (0.1242) loss: 0.8461 (0.8461) time: 4.7881 data: 4.4646 max mem: 26157 Train: [49] [ 100/6250] eta: 0:34:02 lr: 0.000070 grad: 0.1253 (0.1704) loss: 0.8040 (0.8074) time: 0.2714 data: 0.0002 max mem: 26157 Train: [49] [ 200/6250] eta: 0:30:27 lr: 0.000070 grad: 0.1308 (0.1512) loss: 0.7955 (0.8047) time: 0.2723 data: 0.0002 max mem: 26157 Train: [49] [ 300/6250] eta: 0:28:57 lr: 0.000070 grad: 0.1114 (0.1421) loss: 0.8100 (0.8057) time: 0.2721 data: 0.0002 max mem: 26157 Train: [49] [ 400/6250] eta: 0:27:57 lr: 0.000070 grad: 0.1104 (0.1357) loss: 0.8069 (0.8063) time: 0.2715 data: 0.0002 max mem: 26157 Train: [49] [ 500/6250] eta: 0:27:09 lr: 0.000070 grad: 0.1217 (0.1312) loss: 0.8024 (0.8064) time: 0.2696 data: 0.0002 max mem: 26157 Train: [49] [ 600/6250] eta: 0:26:27 lr: 0.000070 grad: 0.1107 (0.1297) loss: 0.8111 (0.8068) time: 0.2691 data: 0.0002 max mem: 26157 Train: [49] [ 700/6250] eta: 0:25:50 lr: 0.000069 grad: 0.1051 (0.1272) loss: 0.8080 (0.8071) time: 0.2685 data: 0.0001 max mem: 26157 Train: [49] [ 800/6250] eta: 0:26:25 lr: 0.000069 grad: 0.1111 (0.1261) loss: 0.8081 (0.8069) time: 0.2792 data: 0.0004 max mem: 26157 Train: [49] [ 900/6250] eta: 0:26:36 lr: 0.000069 grad: 0.1205 (0.1258) loss: 0.8089 (0.8068) time: 0.2683 data: 0.0001 max mem: 26157 Train: [49] [1000/6250] eta: 0:26:07 lr: 0.000069 grad: 0.1111 (0.1250) loss: 0.8064 (0.8068) time: 0.2767 data: 0.0002 max mem: 26157 Train: [49] [1100/6250] eta: 0:26:41 lr: 0.000069 grad: 0.1068 (0.1243) loss: 0.8020 (0.8066) time: 0.6763 data: 0.3973 max mem: 26157 Train: [49] [1200/6250] eta: 0:25:54 lr: 0.000069 grad: 0.1137 (0.1239) loss: 0.8026 (0.8062) time: 0.2697 data: 0.0002 max mem: 26157 Train: [49] [1300/6250] eta: 0:25:09 lr: 0.000069 grad: 0.1161 (0.1235) loss: 0.8070 (0.8058) time: 0.2723 data: 0.0002 max mem: 26157 Train: [49] [1400/6250] eta: 0:25:03 lr: 0.000069 grad: 0.1120 (0.1231) loss: 0.7975 (0.8055) time: 0.7630 data: 0.4877 max mem: 26157 Train: [49] [1500/6250] eta: 0:24:47 lr: 0.000069 grad: 0.1181 (0.1227) loss: 0.7961 (0.8050) time: 0.2709 data: 0.0002 max mem: 26157 Train: [49] [1600/6250] eta: 0:24:04 lr: 0.000069 grad: 0.1270 (0.1229) loss: 0.7928 (0.8044) time: 0.2722 data: 0.0002 max mem: 26157 Train: [49] [1700/6250] eta: 0:23:22 lr: 0.000069 grad: 0.1230 (0.1231) loss: 0.8013 (0.8041) time: 0.2703 data: 0.0002 max mem: 26157 Train: [49] [1800/6250] eta: 0:22:41 lr: 0.000069 grad: 0.1128 (0.1233) loss: 0.7981 (0.8038) time: 0.2685 data: 0.0002 max mem: 26157 Train: [49] [1900/6250] eta: 0:22:17 lr: 0.000069 grad: 0.1160 (0.1236) loss: 0.7888 (0.8034) time: 0.2817 data: 0.0003 max mem: 26157 Train: [49] [2000/6250] eta: 0:21:46 lr: 0.000069 grad: 0.1195 (0.1237) loss: 0.7943 (0.8030) time: 0.2719 data: 0.0004 max mem: 26157 Train: [49] [2100/6250] eta: 0:21:08 lr: 0.000069 grad: 0.1238 (0.1239) loss: 0.7948 (0.8027) time: 0.2703 data: 0.0002 max mem: 26157 Train: [49] [2200/6250] eta: 0:20:31 lr: 0.000069 grad: 0.1096 (0.1241) loss: 0.8012 (0.8025) time: 0.2750 data: 0.0003 max mem: 26157 Train: [49] [2300/6250] eta: 0:19:54 lr: 0.000069 grad: 0.1169 (0.1242) loss: 0.7982 (0.8022) time: 0.2700 data: 0.0002 max mem: 26157 Train: [49] [2400/6250] eta: 0:19:19 lr: 0.000069 grad: 0.1216 (0.1243) loss: 0.8028 (0.8022) time: 0.2693 data: 0.0002 max mem: 26157 Train: [49] [2500/6250] eta: 0:18:44 lr: 0.000069 grad: 0.1277 (0.1245) loss: 0.7942 (0.8019) time: 0.2695 data: 0.0002 max mem: 26157 Train: [49] [2600/6250] eta: 0:18:10 lr: 0.000069 grad: 0.1247 (0.1249) loss: 0.7879 (0.8016) time: 0.2718 data: 0.0002 max mem: 26157 Train: [49] [2700/6250] eta: 0:18:01 lr: 0.000069 grad: 0.1156 (0.1251) loss: 0.7925 (0.8014) time: 0.6583 data: 0.3776 max mem: 26157 Train: [49] [2800/6250] eta: 0:17:33 lr: 0.000069 grad: 0.1174 (0.1255) loss: 0.7941 (0.8012) time: 0.2730 data: 0.0002 max mem: 26157 Train: [49] [2900/6250] eta: 0:16:59 lr: 0.000069 grad: 0.1260 (0.1258) loss: 0.7931 (0.8010) time: 0.2889 data: 0.0090 max mem: 26157 Train: [49] [3000/6250] eta: 0:16:39 lr: 0.000069 grad: 0.1193 (0.1258) loss: 0.8025 (0.8009) time: 0.3992 data: 0.1276 max mem: 26157 Train: [49] [3100/6250] eta: 0:16:05 lr: 0.000069 grad: 0.1236 (0.1261) loss: 0.8009 (0.8008) time: 0.2698 data: 0.0002 max mem: 26157 Train: [49] [3200/6250] eta: 0:15:41 lr: 0.000069 grad: 0.1287 (0.1264) loss: 0.8021 (0.8007) time: 0.2735 data: 0.0002 max mem: 26157 Train: [49] [3300/6250] eta: 0:15:07 lr: 0.000069 grad: 0.1208 (0.1266) loss: 0.7917 (0.8005) time: 0.2688 data: 0.0002 max mem: 26157 Train: [49] [3400/6250] eta: 0:14:33 lr: 0.000069 grad: 0.1287 (0.1267) loss: 0.7890 (0.8003) time: 0.2700 data: 0.0002 max mem: 26157 Train: [49] [3500/6250] eta: 0:13:59 lr: 0.000069 grad: 0.1192 (0.1269) loss: 0.7935 (0.8001) time: 0.2716 data: 0.0002 max mem: 26157 Train: [49] [3600/6250] eta: 0:13:26 lr: 0.000069 grad: 0.1416 (0.1274) loss: 0.7943 (0.7999) time: 0.2687 data: 0.0002 max mem: 26157 Train: [49] [3700/6250] eta: 0:12:53 lr: 0.000069 grad: 0.1255 (0.1278) loss: 0.7892 (0.7997) time: 0.2702 data: 0.0002 max mem: 26157 Train: [49] [3800/6250] eta: 0:12:21 lr: 0.000068 grad: 0.1330 (0.1281) loss: 0.7953 (0.7996) time: 0.2692 data: 0.0002 max mem: 26157 Train: [49] [3900/6250] eta: 0:11:49 lr: 0.000068 grad: 0.1206 (0.1283) loss: 0.7943 (0.7995) time: 0.2737 data: 0.0002 max mem: 26157 Train: [49] [4000/6250] eta: 0:11:17 lr: 0.000068 grad: 0.1169 (0.1282) loss: 0.7904 (0.7994) time: 0.2710 data: 0.0002 max mem: 26157 Train: [49] [4100/6250] eta: 0:10:45 lr: 0.000068 grad: 0.1127 (0.1283) loss: 0.7982 (0.7992) time: 0.2692 data: 0.0002 max mem: 26157 Train: [49] [4200/6250] eta: 0:10:14 lr: 0.000068 grad: 0.1267 (0.1283) loss: 0.8030 (0.7993) time: 0.2737 data: 0.0002 max mem: 26157 Train: [49] [4300/6250] eta: 0:09:42 lr: 0.000068 grad: 0.1142 (0.1283) loss: 0.8002 (0.7993) time: 0.2690 data: 0.0002 max mem: 26157 Train: [49] [4400/6250] eta: 0:09:11 lr: 0.000068 grad: 0.1198 (0.1283) loss: 0.7982 (0.7993) time: 0.2705 data: 0.0002 max mem: 26157 Train: [49] [4500/6250] eta: 0:08:43 lr: 0.000068 grad: 0.1159 (0.1283) loss: 0.8025 (0.7993) time: 0.5447 data: 0.2556 max mem: 26157 Train: [49] [4600/6250] eta: 0:08:12 lr: 0.000068 grad: 0.1255 (0.1285) loss: 0.7965 (0.7994) time: 0.2696 data: 0.0002 max mem: 26157 Train: [49] [4700/6250] eta: 0:07:41 lr: 0.000068 grad: 0.1190 (0.1286) loss: 0.8020 (0.7994) time: 0.2712 data: 0.0002 max mem: 26157 Train: [49] [4800/6250] eta: 0:07:11 lr: 0.000068 grad: 0.1242 (0.1286) loss: 0.7983 (0.7995) time: 0.2700 data: 0.0002 max mem: 26157 Train: [49] [4900/6250] eta: 0:06:40 lr: 0.000068 grad: 0.1208 (0.1286) loss: 0.8040 (0.7995) time: 0.2695 data: 0.0002 max mem: 26157 Train: [49] [5000/6250] eta: 0:06:10 lr: 0.000068 grad: 0.1243 (0.1286) loss: 0.7987 (0.7996) time: 0.2693 data: 0.0002 max mem: 26157 Train: [49] [5100/6250] eta: 0:05:40 lr: 0.000068 grad: 0.1133 (0.1286) loss: 0.8022 (0.7997) time: 0.2695 data: 0.0002 max mem: 26157 Train: [49] [5200/6250] eta: 0:05:10 lr: 0.000068 grad: 0.1264 (0.1286) loss: 0.7975 (0.7997) time: 0.2718 data: 0.0002 max mem: 26157 Train: [49] [5300/6250] eta: 0:04:40 lr: 0.000068 grad: 0.1145 (0.1286) loss: 0.8038 (0.7997) time: 0.2692 data: 0.0002 max mem: 26157 Train: [49] [5400/6250] eta: 0:04:10 lr: 0.000068 grad: 0.1269 (0.1287) loss: 0.8060 (0.7997) time: 0.2694 data: 0.0002 max mem: 26157 Train: [49] [5500/6250] eta: 0:03:40 lr: 0.000068 grad: 0.1165 (0.1288) loss: 0.8010 (0.7997) time: 0.2701 data: 0.0002 max mem: 26157 Train: [49] [5600/6250] eta: 0:03:10 lr: 0.000068 grad: 0.1253 (0.1289) loss: 0.7987 (0.7998) time: 0.2692 data: 0.0002 max mem: 26157 Train: [49] [5700/6250] eta: 0:02:41 lr: 0.000068 grad: 0.1166 (0.1288) loss: 0.8035 (0.7998) time: 0.2711 data: 0.0002 max mem: 26157 Train: [49] [5800/6250] eta: 0:02:11 lr: 0.000068 grad: 0.1244 (0.1288) loss: 0.7977 (0.7999) time: 0.2716 data: 0.0002 max mem: 26157 Train: [49] [5900/6250] eta: 0:01:42 lr: 0.000068 grad: 0.1167 (0.1288) loss: 0.8028 (0.7999) time: 0.2690 data: 0.0002 max mem: 26157 Train: [49] [6000/6250] eta: 0:01:12 lr: 0.000068 grad: 0.1104 (0.1289) loss: 0.8050 (0.8000) time: 0.2690 data: 0.0002 max mem: 26157 Train: [49] [6100/6250] eta: 0:00:43 lr: 0.000068 grad: 0.1213 (0.1289) loss: 0.7958 (0.8000) time: 0.2718 data: 0.0002 max mem: 26157 Train: [49] [6200/6250] eta: 0:00:14 lr: 0.000068 grad: 0.1239 (0.1290) loss: 0.8036 (0.8001) time: 0.2726 data: 0.0002 max mem: 26157 Train: [49] [6249/6250] eta: 0:00:00 lr: 0.000068 grad: 0.1254 (0.1290) loss: 0.7952 (0.8001) time: 0.2712 data: 0.0002 max mem: 26157 Train: [49] Total time: 0:30:26 (0.2922 s / it) Averaged stats: lr: 0.000068 grad: 0.1254 (0.1290) loss: 0.7952 (0.8001) Eval (hcp-train-subset): [49] [ 0/62] eta: 0:04:14 loss: 0.8335 (0.8335) time: 4.0996 data: 3.9696 max mem: 26157 Eval (hcp-train-subset): [49] [61/62] eta: 0:00:00 loss: 0.8240 (0.8249) time: 0.1281 data: 0.0437 max mem: 26157 Eval (hcp-train-subset): [49] Total time: 0:00:13 (0.2139 s / it) Averaged stats (hcp-train-subset): loss: 0.8240 (0.8249) Making plots (hcp-train-subset): example=59 Eval (hcp-val): [49] [ 0/62] eta: 0:05:37 loss: 0.8228 (0.8228) time: 5.4385 data: 5.3537 max mem: 26157 Eval (hcp-val): [49] [61/62] eta: 0:00:00 loss: 0.8244 (0.8258) time: 0.1360 data: 0.0529 max mem: 26157 Eval (hcp-val): [49] Total time: 0:00:13 (0.2211 s / it) Averaged stats (hcp-val): loss: 0.8244 (0.8258) Making plots (hcp-val): example=33 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [50] [ 0/6250] eta: 7:45:08 lr: 0.000068 grad: 0.0737 (0.0737) loss: 0.8313 (0.8313) time: 4.4654 data: 4.0874 max mem: 26157 Train: [50] [ 100/6250] eta: 0:34:21 lr: 0.000068 grad: 0.1583 (0.1611) loss: 0.7894 (0.8068) time: 0.2706 data: 0.0004 max mem: 26157 Train: [50] [ 200/6250] eta: 0:30:37 lr: 0.000068 grad: 0.1404 (0.1539) loss: 0.7894 (0.8019) time: 0.2732 data: 0.0003 max mem: 26157 Train: [50] [ 300/6250] eta: 0:29:00 lr: 0.000068 grad: 0.1279 (0.1510) loss: 0.8043 (0.8008) time: 0.2724 data: 0.0002 max mem: 26157 Train: [50] [ 400/6250] eta: 0:27:59 lr: 0.000068 grad: 0.1186 (0.1459) loss: 0.8006 (0.8010) time: 0.2697 data: 0.0002 max mem: 26157 Train: [50] [ 500/6250] eta: 0:27:12 lr: 0.000067 grad: 0.1346 (0.1447) loss: 0.7976 (0.8009) time: 0.2695 data: 0.0002 max mem: 26157 Train: [50] [ 600/6250] eta: 0:26:30 lr: 0.000067 grad: 0.1195 (0.1446) loss: 0.8046 (0.8004) time: 0.2690 data: 0.0002 max mem: 26157 Train: [50] [ 700/6250] eta: 0:25:52 lr: 0.000067 grad: 0.1164 (0.1431) loss: 0.8018 (0.8000) time: 0.2704 data: 0.0001 max mem: 26157 Train: [50] [ 800/6250] eta: 0:25:17 lr: 0.000067 grad: 0.1206 (0.1405) loss: 0.8104 (0.8003) time: 0.2693 data: 0.0002 max mem: 26157 Train: [50] [ 900/6250] eta: 0:24:45 lr: 0.000067 grad: 0.1194 (0.1407) loss: 0.8073 (0.8004) time: 0.2710 data: 0.0002 max mem: 26157 Train: [50] [1000/6250] eta: 0:24:13 lr: 0.000067 grad: 0.1180 (0.1397) loss: 0.8014 (0.8007) time: 0.2694 data: 0.0002 max mem: 26157 Train: [50] [1100/6250] eta: 0:23:42 lr: 0.000067 grad: 0.1110 (0.1377) loss: 0.8016 (0.8008) time: 0.2685 data: 0.0002 max mem: 26157 Train: [50] [1200/6250] eta: 0:23:12 lr: 0.000067 grad: 0.1110 (0.1372) loss: 0.7932 (0.8007) time: 0.2688 data: 0.0002 max mem: 26157 Train: [50] [1300/6250] eta: 0:22:42 lr: 0.000067 grad: 0.1115 (0.1363) loss: 0.8041 (0.8008) time: 0.2702 data: 0.0002 max mem: 26157 Train: [50] [1400/6250] eta: 0:22:13 lr: 0.000067 grad: 0.1190 (0.1361) loss: 0.7985 (0.8009) time: 0.2719 data: 0.0002 max mem: 26157 Train: [50] [1500/6250] eta: 0:21:44 lr: 0.000067 grad: 0.1146 (0.1355) loss: 0.8013 (0.8008) time: 0.2720 data: 0.0004 max mem: 26157 Train: [50] [1600/6250] eta: 0:21:25 lr: 0.000067 grad: 0.1043 (0.1345) loss: 0.8025 (0.8008) time: 0.2712 data: 0.0002 max mem: 26157 Train: [50] [1700/6250] eta: 0:20:56 lr: 0.000067 grad: 0.1159 (0.1342) loss: 0.8011 (0.8007) time: 0.2683 data: 0.0002 max mem: 26157 Train: [50] [1800/6250] eta: 0:20:27 lr: 0.000067 grad: 0.1153 (0.1339) loss: 0.8011 (0.8006) time: 0.2733 data: 0.0002 max mem: 26157 Train: [50] [1900/6250] eta: 0:20:16 lr: 0.000067 grad: 0.1175 (0.1339) loss: 0.8031 (0.8006) time: 0.2704 data: 0.0002 max mem: 26157 Train: [50] [2000/6250] eta: 0:19:46 lr: 0.000067 grad: 0.1334 (0.1341) loss: 0.7995 (0.8005) time: 0.2680 data: 0.0002 max mem: 26157 Train: [50] [2100/6250] eta: 0:19:16 lr: 0.000067 grad: 0.1181 (0.1339) loss: 0.7986 (0.8003) time: 0.2693 data: 0.0002 max mem: 26157 Train: [50] [2200/6250] eta: 0:18:47 lr: 0.000067 grad: 0.1186 (0.1336) loss: 0.7964 (0.8002) time: 0.2749 data: 0.0002 max mem: 26157 Train: [50] [2300/6250] eta: 0:18:25 lr: 0.000067 grad: 0.1203 (0.1337) loss: 0.7998 (0.7999) time: 0.2690 data: 0.0002 max mem: 26157 Train: [50] [2400/6250] eta: 0:17:55 lr: 0.000067 grad: 0.1184 (0.1332) loss: 0.7948 (0.7998) time: 0.2699 data: 0.0002 max mem: 26157 Train: [50] [2500/6250] eta: 0:17:26 lr: 0.000067 grad: 0.1132 (0.1328) loss: 0.7974 (0.7999) time: 0.2680 data: 0.0002 max mem: 26157 Train: [50] [2600/6250] eta: 0:16:56 lr: 0.000067 grad: 0.1201 (0.1326) loss: 0.7987 (0.7999) time: 0.2693 data: 0.0002 max mem: 26157 Train: [50] [2700/6250] eta: 0:16:27 lr: 0.000067 grad: 0.1192 (0.1326) loss: 0.7974 (0.7998) time: 0.2712 data: 0.0002 max mem: 26157 Train: [50] [2800/6250] eta: 0:15:59 lr: 0.000067 grad: 0.1192 (0.1329) loss: 0.7964 (0.7998) time: 0.2692 data: 0.0002 max mem: 26157 Train: [50] [2900/6250] eta: 0:15:30 lr: 0.000067 grad: 0.1240 (0.1331) loss: 0.8036 (0.7998) time: 0.2706 data: 0.0002 max mem: 26157 Train: [50] [3000/6250] eta: 0:15:01 lr: 0.000067 grad: 0.1207 (0.1331) loss: 0.8012 (0.7998) time: 0.2699 data: 0.0002 max mem: 26157 Train: [50] [3100/6250] eta: 0:14:33 lr: 0.000067 grad: 0.1174 (0.1331) loss: 0.8011 (0.7998) time: 0.2733 data: 0.0002 max mem: 26157 Train: [50] [3200/6250] eta: 0:14:30 lr: 0.000067 grad: 0.1153 (0.1329) loss: 0.7986 (0.7997) time: 0.8742 data: 0.6048 max mem: 26157 Train: [50] [3300/6250] eta: 0:14:00 lr: 0.000067 grad: 0.1307 (0.1331) loss: 0.7953 (0.7996) time: 0.2743 data: 0.0002 max mem: 26157 Train: [50] [3400/6250] eta: 0:13:33 lr: 0.000067 grad: 0.1280 (0.1333) loss: 0.7958 (0.7995) time: 0.2711 data: 0.0002 max mem: 26157 Train: [50] [3500/6250] eta: 0:13:03 lr: 0.000067 grad: 0.1383 (0.1337) loss: 0.7891 (0.7994) time: 0.2700 data: 0.0002 max mem: 26157 Train: [50] [3600/6250] eta: 0:12:36 lr: 0.000066 grad: 0.1231 (0.1337) loss: 0.7975 (0.7993) time: 0.3139 data: 0.0378 max mem: 26157 Train: [50] [3700/6250] eta: 0:12:08 lr: 0.000066 grad: 0.1262 (0.1337) loss: 0.7979 (0.7992) time: 0.2692 data: 0.0002 max mem: 26157 Train: [50] [3800/6250] eta: 0:11:38 lr: 0.000066 grad: 0.1306 (0.1338) loss: 0.7971 (0.7990) time: 0.2702 data: 0.0002 max mem: 26157 Train: [50] [3900/6250] eta: 0:11:09 lr: 0.000066 grad: 0.1202 (0.1339) loss: 0.7956 (0.7989) time: 0.2697 data: 0.0002 max mem: 26157 Train: [50] [4000/6250] eta: 0:10:39 lr: 0.000066 grad: 0.1314 (0.1339) loss: 0.7928 (0.7987) time: 0.2697 data: 0.0003 max mem: 26157 Train: [50] [4100/6250] eta: 0:10:10 lr: 0.000066 grad: 0.1231 (0.1341) loss: 0.7886 (0.7986) time: 0.2736 data: 0.0002 max mem: 26157 Train: [50] [4200/6250] eta: 0:09:41 lr: 0.000066 grad: 0.1315 (0.1341) loss: 0.7948 (0.7985) time: 0.2702 data: 0.0002 max mem: 26157 Train: [50] [4300/6250] eta: 0:09:19 lr: 0.000066 grad: 0.1318 (0.1342) loss: 0.7936 (0.7984) time: 0.2743 data: 0.0002 max mem: 26157 Train: [50] [4400/6250] eta: 0:08:49 lr: 0.000066 grad: 0.1168 (0.1341) loss: 0.8005 (0.7983) time: 0.2716 data: 0.0002 max mem: 26157 Train: [50] [4500/6250] eta: 0:08:20 lr: 0.000066 grad: 0.1245 (0.1340) loss: 0.7972 (0.7983) time: 0.2711 data: 0.0002 max mem: 26157 Train: [50] [4600/6250] eta: 0:07:51 lr: 0.000066 grad: 0.1276 (0.1340) loss: 0.8012 (0.7983) time: 0.2673 data: 0.0002 max mem: 26157 Train: [50] [4700/6250] eta: 0:07:22 lr: 0.000066 grad: 0.1200 (0.1339) loss: 0.7928 (0.7984) time: 0.2695 data: 0.0002 max mem: 26157 Train: [50] [4800/6250] eta: 0:06:53 lr: 0.000066 grad: 0.1210 (0.1339) loss: 0.8060 (0.7984) time: 0.2688 data: 0.0002 max mem: 26157 Train: [50] [4900/6250] eta: 0:06:24 lr: 0.000066 grad: 0.1182 (0.1339) loss: 0.8003 (0.7984) time: 0.2674 data: 0.0002 max mem: 26157 Train: [50] [5000/6250] eta: 0:05:55 lr: 0.000066 grad: 0.1225 (0.1341) loss: 0.7925 (0.7984) time: 0.2697 data: 0.0002 max mem: 26157 Train: [50] [5100/6250] eta: 0:05:26 lr: 0.000066 grad: 0.1220 (0.1341) loss: 0.7996 (0.7984) time: 0.2680 data: 0.0002 max mem: 26157 Train: [50] [5200/6250] eta: 0:04:57 lr: 0.000066 grad: 0.1218 (0.1339) loss: 0.7986 (0.7983) time: 0.2689 data: 0.0002 max mem: 26157 Train: [50] [5300/6250] eta: 0:04:29 lr: 0.000066 grad: 0.1239 (0.1338) loss: 0.7945 (0.7984) time: 0.2684 data: 0.0001 max mem: 26157 Train: [50] [5400/6250] eta: 0:04:00 lr: 0.000066 grad: 0.1308 (0.1338) loss: 0.7921 (0.7984) time: 0.2684 data: 0.0002 max mem: 26157 Train: [50] [5500/6250] eta: 0:03:32 lr: 0.000066 grad: 0.1221 (0.1340) loss: 0.7971 (0.7984) time: 0.2684 data: 0.0001 max mem: 26157 Train: [50] [5600/6250] eta: 0:03:03 lr: 0.000066 grad: 0.1219 (0.1340) loss: 0.8026 (0.7984) time: 0.2698 data: 0.0002 max mem: 26157 Train: [50] [5700/6250] eta: 0:02:35 lr: 0.000066 grad: 0.1248 (0.1341) loss: 0.7976 (0.7984) time: 0.2726 data: 0.0002 max mem: 26157 Train: [50] [5800/6250] eta: 0:02:07 lr: 0.000066 grad: 0.1175 (0.1341) loss: 0.8021 (0.7984) time: 0.2705 data: 0.0002 max mem: 26157 Train: [50] [5900/6250] eta: 0:01:38 lr: 0.000066 grad: 0.1287 (0.1342) loss: 0.7936 (0.7984) time: 0.2701 data: 0.0002 max mem: 26157 Train: [50] [6000/6250] eta: 0:01:10 lr: 0.000066 grad: 0.1302 (0.1343) loss: 0.7966 (0.7983) time: 0.2699 data: 0.0002 max mem: 26157 Train: [50] [6100/6250] eta: 0:00:42 lr: 0.000066 grad: 0.1222 (0.1342) loss: 0.8003 (0.7984) time: 0.2707 data: 0.0002 max mem: 26157 Train: [50] [6200/6250] eta: 0:00:14 lr: 0.000066 grad: 0.1258 (0.1345) loss: 0.7957 (0.7983) time: 0.7128 data: 0.4313 max mem: 26157 Train: [50] [6249/6250] eta: 0:00:00 lr: 0.000066 grad: 0.1308 (0.1346) loss: 0.7977 (0.7983) time: 0.4723 data: 0.1976 max mem: 26157 Train: [50] Total time: 0:29:51 (0.2867 s / it) Averaged stats: lr: 0.000066 grad: 0.1308 (0.1346) loss: 0.7977 (0.7983) Eval (hcp-train-subset): [50] [ 0/62] eta: 0:04:46 loss: 0.8307 (0.8307) time: 4.6236 data: 4.5032 max mem: 26157 Eval (hcp-train-subset): [50] [61/62] eta: 0:00:00 loss: 0.8217 (0.8223) time: 0.1465 data: 0.0634 max mem: 26157 Eval (hcp-train-subset): [50] Total time: 0:00:13 (0.2202 s / it) Averaged stats (hcp-train-subset): loss: 0.8217 (0.8223) Making plots (hcp-train-subset): example=36 Eval (hcp-val): [50] [ 0/62] eta: 0:04:52 loss: 0.8234 (0.8234) time: 4.7252 data: 4.6416 max mem: 26157 Eval (hcp-val): [50] [61/62] eta: 0:00:00 loss: 0.8235 (0.8251) time: 0.1146 data: 0.0318 max mem: 26157 Eval (hcp-val): [50] Total time: 0:00:13 (0.2104 s / it) Averaged stats (hcp-val): loss: 0.8235 (0.8251) Making plots (hcp-val): example=39 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [51] [ 0/6250] eta: 8:20:08 lr: 0.000066 grad: 0.2176 (0.2176) loss: 0.7914 (0.7914) time: 4.8013 data: 4.4505 max mem: 26157 Train: [51] [ 100/6250] eta: 0:33:56 lr: 0.000066 grad: 0.1245 (0.1457) loss: 0.8146 (0.8204) time: 0.2703 data: 0.0002 max mem: 26157 Train: [51] [ 200/6250] eta: 0:30:19 lr: 0.000066 grad: 0.1370 (0.1411) loss: 0.8082 (0.8126) time: 0.2697 data: 0.0002 max mem: 26157 Train: [51] [ 300/6250] eta: 0:28:49 lr: 0.000065 grad: 0.1295 (0.1380) loss: 0.8120 (0.8093) time: 0.2703 data: 0.0002 max mem: 26157 Train: [51] [ 400/6250] eta: 0:27:49 lr: 0.000065 grad: 0.1258 (0.1366) loss: 0.7906 (0.8070) time: 0.2686 data: 0.0002 max mem: 26157 Train: [51] [ 500/6250] eta: 0:27:00 lr: 0.000065 grad: 0.1155 (0.1346) loss: 0.8047 (0.8063) time: 0.2677 data: 0.0001 max mem: 26157 Train: [51] [ 600/6250] eta: 0:26:23 lr: 0.000065 grad: 0.1284 (0.1339) loss: 0.8038 (0.8053) time: 0.2716 data: 0.0002 max mem: 26157 Train: [51] [ 700/6250] eta: 0:25:46 lr: 0.000065 grad: 0.1218 (0.1344) loss: 0.8039 (0.8043) time: 0.2691 data: 0.0002 max mem: 26157 Train: [51] [ 800/6250] eta: 0:25:12 lr: 0.000065 grad: 0.1202 (0.1343) loss: 0.7986 (0.8038) time: 0.2688 data: 0.0001 max mem: 26157 Train: [51] [ 900/6250] eta: 0:24:39 lr: 0.000065 grad: 0.1206 (0.1337) loss: 0.8026 (0.8035) time: 0.2688 data: 0.0001 max mem: 26157 Train: [51] [1000/6250] eta: 0:24:09 lr: 0.000065 grad: 0.1090 (0.1325) loss: 0.8076 (0.8033) time: 0.2690 data: 0.0002 max mem: 26157 Train: [51] [1100/6250] eta: 0:23:38 lr: 0.000065 grad: 0.1247 (0.1318) loss: 0.7934 (0.8025) time: 0.2687 data: 0.0002 max mem: 26157 Train: [51] [1200/6250] eta: 0:23:08 lr: 0.000065 grad: 0.1198 (0.1316) loss: 0.7982 (0.8020) time: 0.2691 data: 0.0002 max mem: 26157 Train: [51] [1300/6250] eta: 0:22:38 lr: 0.000065 grad: 0.1197 (0.1312) loss: 0.7996 (0.8017) time: 0.2685 data: 0.0002 max mem: 26157 Train: [51] [1400/6250] eta: 0:22:09 lr: 0.000065 grad: 0.1210 (0.1311) loss: 0.7982 (0.8015) time: 0.2713 data: 0.0002 max mem: 26157 Train: [51] [1500/6250] eta: 0:21:41 lr: 0.000065 grad: 0.1195 (0.1311) loss: 0.7961 (0.8011) time: 0.2764 data: 0.0003 max mem: 26157 Train: [51] [1600/6250] eta: 0:21:27 lr: 0.000065 grad: 0.1148 (0.1313) loss: 0.8029 (0.8009) time: 0.2742 data: 0.0002 max mem: 26157 Train: [51] [1700/6250] eta: 0:21:10 lr: 0.000065 grad: 0.1174 (0.1316) loss: 0.7976 (0.8007) time: 0.2700 data: 0.0002 max mem: 26157 Train: [51] [1800/6250] eta: 0:20:45 lr: 0.000065 grad: 0.1232 (0.1320) loss: 0.7965 (0.8004) time: 0.2721 data: 0.0002 max mem: 26157 Train: [51] [1900/6250] eta: 0:20:27 lr: 0.000065 grad: 0.1410 (0.1323) loss: 0.7963 (0.8002) time: 0.2715 data: 0.0002 max mem: 26157 Train: [51] [2000/6250] eta: 0:20:06 lr: 0.000065 grad: 0.1300 (0.1322) loss: 0.7980 (0.8001) time: 0.3112 data: 0.0396 max mem: 26157 Train: [51] [2100/6250] eta: 0:19:36 lr: 0.000065 grad: 0.1271 (0.1323) loss: 0.7932 (0.8000) time: 0.2710 data: 0.0002 max mem: 26157 Train: [51] [2200/6250] eta: 0:19:14 lr: 0.000065 grad: 0.1171 (0.1324) loss: 0.8023 (0.7999) time: 0.2700 data: 0.0002 max mem: 26157 Train: [51] [2300/6250] eta: 0:18:43 lr: 0.000065 grad: 0.1142 (0.1323) loss: 0.7988 (0.7998) time: 0.2694 data: 0.0002 max mem: 26157 Train: [51] [2400/6250] eta: 0:18:13 lr: 0.000065 grad: 0.1281 (0.1324) loss: 0.7967 (0.7998) time: 0.2690 data: 0.0002 max mem: 26157 Train: [51] [2500/6250] eta: 0:17:42 lr: 0.000065 grad: 0.1204 (0.1322) loss: 0.8003 (0.7997) time: 0.2713 data: 0.0002 max mem: 26157 Train: [51] [2600/6250] eta: 0:17:12 lr: 0.000065 grad: 0.1253 (0.1324) loss: 0.8019 (0.7997) time: 0.2709 data: 0.0002 max mem: 26157 Train: [51] [2700/6250] eta: 0:16:42 lr: 0.000065 grad: 0.1235 (0.1322) loss: 0.7969 (0.7997) time: 0.2711 data: 0.0002 max mem: 26157 Train: [51] [2800/6250] eta: 0:16:13 lr: 0.000065 grad: 0.1184 (0.1322) loss: 0.7989 (0.7996) time: 0.2702 data: 0.0002 max mem: 26157 Train: [51] [2900/6250] eta: 0:15:45 lr: 0.000065 grad: 0.1196 (0.1321) loss: 0.8044 (0.7996) time: 0.2696 data: 0.0002 max mem: 26157 Train: [51] [3000/6250] eta: 0:15:16 lr: 0.000065 grad: 0.1165 (0.1321) loss: 0.7937 (0.7994) time: 0.2723 data: 0.0002 max mem: 26157 Train: [51] [3100/6250] eta: 0:14:46 lr: 0.000065 grad: 0.1256 (0.1323) loss: 0.7954 (0.7994) time: 0.2712 data: 0.0002 max mem: 26157 Train: [51] [3200/6250] eta: 0:14:17 lr: 0.000065 grad: 0.1155 (0.1323) loss: 0.7980 (0.7993) time: 0.2701 data: 0.0002 max mem: 26157 Train: [51] [3300/6250] eta: 0:13:48 lr: 0.000065 grad: 0.1205 (0.1322) loss: 0.7944 (0.7992) time: 0.2700 data: 0.0002 max mem: 26157 Train: [51] [3400/6250] eta: 0:13:19 lr: 0.000064 grad: 0.1225 (0.1320) loss: 0.7961 (0.7991) time: 0.2695 data: 0.0002 max mem: 26157 Train: [51] [3500/6250] eta: 0:12:50 lr: 0.000064 grad: 0.1167 (0.1321) loss: 0.7978 (0.7990) time: 0.2702 data: 0.0002 max mem: 26157 Train: [51] [3600/6250] eta: 0:12:21 lr: 0.000064 grad: 0.1194 (0.1322) loss: 0.8040 (0.7990) time: 0.2688 data: 0.0002 max mem: 26157 Train: [51] [3700/6250] eta: 0:11:53 lr: 0.000064 grad: 0.1285 (0.1322) loss: 0.7948 (0.7989) time: 0.2727 data: 0.0002 max mem: 26157 Train: [51] [3800/6250] eta: 0:11:28 lr: 0.000064 grad: 0.1262 (0.1321) loss: 0.7950 (0.7989) time: 0.2698 data: 0.0002 max mem: 26157 Train: [51] [3900/6250] eta: 0:11:00 lr: 0.000064 grad: 0.1208 (0.1323) loss: 0.8012 (0.7988) time: 0.2675 data: 0.0002 max mem: 26157 Train: [51] [4000/6250] eta: 0:10:31 lr: 0.000064 grad: 0.1348 (0.1325) loss: 0.7946 (0.7988) time: 0.2689 data: 0.0001 max mem: 26157 Train: [51] [4100/6250] eta: 0:10:04 lr: 0.000064 grad: 0.1309 (0.1325) loss: 0.7992 (0.7988) time: 0.2721 data: 0.0002 max mem: 26157 Train: [51] [4200/6250] eta: 0:09:40 lr: 0.000064 grad: 0.1229 (0.1325) loss: 0.8002 (0.7988) time: 0.5639 data: 0.2899 max mem: 26157 Train: [51] [4300/6250] eta: 0:09:13 lr: 0.000064 grad: 0.1285 (0.1326) loss: 0.7953 (0.7988) time: 0.2718 data: 0.0002 max mem: 26157 Train: [51] [4400/6250] eta: 0:08:50 lr: 0.000064 grad: 0.1260 (0.1327) loss: 0.7931 (0.7987) time: 0.2704 data: 0.0002 max mem: 26157 Train: [51] [4500/6250] eta: 0:08:21 lr: 0.000064 grad: 0.1214 (0.1328) loss: 0.7975 (0.7987) time: 0.2710 data: 0.0002 max mem: 26157 Train: [51] [4600/6250] eta: 0:07:52 lr: 0.000064 grad: 0.1230 (0.1328) loss: 0.7881 (0.7986) time: 0.2692 data: 0.0002 max mem: 26157 Train: [51] [4700/6250] eta: 0:07:22 lr: 0.000064 grad: 0.1306 (0.1330) loss: 0.7938 (0.7985) time: 0.2693 data: 0.0001 max mem: 26157 Train: [51] [4800/6250] eta: 0:06:53 lr: 0.000064 grad: 0.1328 (0.1332) loss: 0.7890 (0.7984) time: 0.2698 data: 0.0002 max mem: 26157 Train: [51] [4900/6250] eta: 0:06:24 lr: 0.000064 grad: 0.1295 (0.1336) loss: 0.7984 (0.7983) time: 0.2697 data: 0.0002 max mem: 26157 Train: [51] [5000/6250] eta: 0:05:55 lr: 0.000064 grad: 0.1343 (0.1338) loss: 0.7868 (0.7982) time: 0.2691 data: 0.0002 max mem: 26157 Train: [51] [5100/6250] eta: 0:05:27 lr: 0.000064 grad: 0.1254 (0.1339) loss: 0.7916 (0.7981) time: 0.2683 data: 0.0002 max mem: 26157 Train: [51] [5200/6250] eta: 0:04:58 lr: 0.000064 grad: 0.1330 (0.1341) loss: 0.7843 (0.7979) time: 0.2685 data: 0.0002 max mem: 26157 Train: [51] [5300/6250] eta: 0:04:29 lr: 0.000064 grad: 0.1348 (0.1343) loss: 0.7898 (0.7978) time: 0.2691 data: 0.0002 max mem: 26157 Train: [51] [5400/6250] eta: 0:04:01 lr: 0.000064 grad: 0.1377 (0.1345) loss: 0.7966 (0.7977) time: 0.2690 data: 0.0002 max mem: 26157 Train: [51] [5500/6250] eta: 0:03:32 lr: 0.000064 grad: 0.1295 (0.1347) loss: 0.7955 (0.7975) time: 0.2706 data: 0.0002 max mem: 26157 Train: [51] [5600/6250] eta: 0:03:04 lr: 0.000064 grad: 0.1306 (0.1349) loss: 0.7888 (0.7974) time: 0.2712 data: 0.0002 max mem: 26157 Train: [51] [5700/6250] eta: 0:02:36 lr: 0.000064 grad: 0.1355 (0.1351) loss: 0.7893 (0.7973) time: 0.2682 data: 0.0002 max mem: 26157 Train: [51] [5800/6250] eta: 0:02:07 lr: 0.000064 grad: 0.1262 (0.1351) loss: 0.7907 (0.7972) time: 0.2685 data: 0.0002 max mem: 26157 Train: [51] [5900/6250] eta: 0:01:39 lr: 0.000064 grad: 0.1347 (0.1351) loss: 0.7903 (0.7972) time: 0.2751 data: 0.0002 max mem: 26157 Train: [51] [6000/6250] eta: 0:01:10 lr: 0.000064 grad: 0.1192 (0.1352) loss: 0.7897 (0.7971) time: 0.2701 data: 0.0002 max mem: 26157 Train: [51] [6100/6250] eta: 0:00:42 lr: 0.000064 grad: 0.1213 (0.1352) loss: 0.7989 (0.7971) time: 0.2684 data: 0.0002 max mem: 26157 Train: [51] [6200/6250] eta: 0:00:14 lr: 0.000064 grad: 0.1238 (0.1351) loss: 0.7879 (0.7971) time: 0.2769 data: 0.0002 max mem: 26157 Train: [51] [6249/6250] eta: 0:00:00 lr: 0.000064 grad: 0.1281 (0.1351) loss: 0.7912 (0.7970) time: 0.2695 data: 0.0002 max mem: 26157 Train: [51] Total time: 0:29:35 (0.2841 s / it) Averaged stats: lr: 0.000064 grad: 0.1281 (0.1351) loss: 0.7912 (0.7970) Eval (hcp-train-subset): [51] [ 0/62] eta: 0:03:38 loss: 0.8349 (0.8349) time: 3.5191 data: 3.4074 max mem: 26157 Eval (hcp-train-subset): [51] [61/62] eta: 0:00:00 loss: 0.8209 (0.8229) time: 0.1363 data: 0.0517 max mem: 26157 Eval (hcp-train-subset): [51] Total time: 0:00:12 (0.2081 s / it) Averaged stats (hcp-train-subset): loss: 0.8209 (0.8229) Making plots (hcp-train-subset): example=59 Eval (hcp-val): [51] [ 0/62] eta: 0:04:39 loss: 0.8218 (0.8218) time: 4.5029 data: 4.3941 max mem: 26157 Eval (hcp-val): [51] [61/62] eta: 0:00:00 loss: 0.8255 (0.8263) time: 0.1366 data: 0.0537 max mem: 26157 Eval (hcp-val): [51] Total time: 0:00:13 (0.2141 s / it) Averaged stats (hcp-val): loss: 0.8255 (0.8263) Making plots (hcp-val): example=9 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [52] [ 0/6250] eta: 8:35:51 lr: 0.000064 grad: 0.1210 (0.1210) loss: 0.8227 (0.8227) time: 4.9522 data: 4.6101 max mem: 26157 Train: [52] [ 100/6250] eta: 0:33:04 lr: 0.000063 grad: 0.1371 (0.1805) loss: 0.7902 (0.8050) time: 0.2687 data: 0.0002 max mem: 26157 Train: [52] [ 200/6250] eta: 0:29:47 lr: 0.000063 grad: 0.1230 (0.1607) loss: 0.7981 (0.8024) time: 0.2687 data: 0.0002 max mem: 26157 Train: [52] [ 300/6250] eta: 0:28:26 lr: 0.000063 grad: 0.1341 (0.1522) loss: 0.7887 (0.8014) time: 0.2703 data: 0.0002 max mem: 26157 Train: [52] [ 400/6250] eta: 0:27:34 lr: 0.000063 grad: 0.1388 (0.1504) loss: 0.7949 (0.8013) time: 0.2737 data: 0.0002 max mem: 26157 Train: [52] [ 500/6250] eta: 0:26:53 lr: 0.000063 grad: 0.1390 (0.1484) loss: 0.8024 (0.8008) time: 0.2703 data: 0.0002 max mem: 26157 Train: [52] [ 600/6250] eta: 0:26:15 lr: 0.000063 grad: 0.1357 (0.1465) loss: 0.8059 (0.8013) time: 0.2688 data: 0.0002 max mem: 26157 Train: [52] [ 700/6250] eta: 0:25:45 lr: 0.000063 grad: 0.1332 (0.1453) loss: 0.7981 (0.8010) time: 0.2810 data: 0.0002 max mem: 26157 Train: [52] [ 800/6250] eta: 0:28:39 lr: 0.000063 grad: 0.1315 (0.1439) loss: 0.7962 (0.8004) time: 0.4388 data: 0.1648 max mem: 26157 Train: [52] [ 900/6250] eta: 0:27:42 lr: 0.000063 grad: 0.1278 (0.1432) loss: 0.8007 (0.8002) time: 0.2697 data: 0.0002 max mem: 26157 Train: [52] [1000/6250] eta: 0:26:50 lr: 0.000063 grad: 0.1311 (0.1434) loss: 0.8010 (0.8000) time: 0.2737 data: 0.0002 max mem: 26157 Train: [52] [1100/6250] eta: 0:26:47 lr: 0.000063 grad: 0.1280 (0.1431) loss: 0.8014 (0.7998) time: 0.2710 data: 0.0002 max mem: 26157 Train: [52] [1200/6250] eta: 0:25:59 lr: 0.000063 grad: 0.1338 (0.1426) loss: 0.7957 (0.7998) time: 0.2720 data: 0.0002 max mem: 26157 Train: [52] [1300/6250] eta: 0:25:14 lr: 0.000063 grad: 0.1353 (0.1429) loss: 0.7938 (0.7995) time: 0.2726 data: 0.0002 max mem: 26157 Train: [52] [1400/6250] eta: 0:24:31 lr: 0.000063 grad: 0.1201 (0.1423) loss: 0.7973 (0.7991) time: 0.2706 data: 0.0002 max mem: 26157 Train: [52] [1500/6250] eta: 0:23:51 lr: 0.000063 grad: 0.1315 (0.1427) loss: 0.7995 (0.7988) time: 0.2718 data: 0.0002 max mem: 26157 Train: [52] [1600/6250] eta: 0:23:39 lr: 0.000063 grad: 0.1336 (0.1432) loss: 0.7941 (0.7985) time: 0.2699 data: 0.0002 max mem: 26157 Train: [52] [1700/6250] eta: 0:22:59 lr: 0.000063 grad: 0.1419 (0.1438) loss: 0.7980 (0.7984) time: 0.2721 data: 0.0002 max mem: 26157 Train: [52] [1800/6250] eta: 0:22:49 lr: 0.000063 grad: 0.1278 (0.1442) loss: 0.7975 (0.7982) time: 0.8397 data: 0.5642 max mem: 26157 Train: [52] [1900/6250] eta: 0:22:10 lr: 0.000063 grad: 0.1346 (0.1439) loss: 0.7927 (0.7982) time: 0.2688 data: 0.0002 max mem: 26157 Train: [52] [2000/6250] eta: 0:21:32 lr: 0.000063 grad: 0.1305 (0.1441) loss: 0.7940 (0.7981) time: 0.2753 data: 0.0002 max mem: 26157 Train: [52] [2100/6250] eta: 0:20:55 lr: 0.000063 grad: 0.1525 (0.1450) loss: 0.7911 (0.7978) time: 0.2699 data: 0.0002 max mem: 26157 Train: [52] [2200/6250] eta: 0:20:19 lr: 0.000063 grad: 0.1294 (0.1450) loss: 0.7953 (0.7976) time: 0.2739 data: 0.0003 max mem: 26157 Train: [52] [2300/6250] eta: 0:20:02 lr: 0.000063 grad: 0.1298 (0.1448) loss: 0.7986 (0.7975) time: 0.2680 data: 0.0001 max mem: 26157 Train: [52] [2400/6250] eta: 0:19:26 lr: 0.000063 grad: 0.1311 (0.1454) loss: 0.7970 (0.7973) time: 0.2708 data: 0.0002 max mem: 26157 Train: [52] [2500/6250] eta: 0:18:51 lr: 0.000063 grad: 0.1539 (0.1456) loss: 0.7828 (0.7972) time: 0.2689 data: 0.0002 max mem: 26157 Train: [52] [2600/6250] eta: 0:18:16 lr: 0.000063 grad: 0.1356 (0.1456) loss: 0.7879 (0.7969) time: 0.2698 data: 0.0002 max mem: 26157 Train: [52] [2700/6250] eta: 0:17:42 lr: 0.000063 grad: 0.1288 (0.1454) loss: 0.7919 (0.7968) time: 0.2685 data: 0.0002 max mem: 26157 Train: [52] [2800/6250] eta: 0:17:08 lr: 0.000063 grad: 0.1293 (0.1455) loss: 0.7922 (0.7966) time: 0.2690 data: 0.0002 max mem: 26157 Train: [52] [2900/6250] eta: 0:16:35 lr: 0.000063 grad: 0.1383 (0.1458) loss: 0.7893 (0.7964) time: 0.2703 data: 0.0002 max mem: 26157 Train: [52] [3000/6250] eta: 0:16:03 lr: 0.000063 grad: 0.1258 (0.1460) loss: 0.7934 (0.7962) time: 0.2708 data: 0.0002 max mem: 26157 Train: [52] [3100/6250] eta: 0:15:31 lr: 0.000063 grad: 0.1329 (0.1460) loss: 0.8041 (0.7962) time: 0.2699 data: 0.0002 max mem: 26157 Train: [52] [3200/6250] eta: 0:14:58 lr: 0.000062 grad: 0.1362 (0.1461) loss: 0.7888 (0.7961) time: 0.2692 data: 0.0002 max mem: 26157 Train: [52] [3300/6250] eta: 0:14:28 lr: 0.000062 grad: 0.1321 (0.1459) loss: 0.7871 (0.7960) time: 0.2724 data: 0.0002 max mem: 26157 Train: [52] [3400/6250] eta: 0:13:56 lr: 0.000062 grad: 0.1255 (0.1458) loss: 0.7927 (0.7959) time: 0.2694 data: 0.0002 max mem: 26157 Train: [52] [3500/6250] eta: 0:13:25 lr: 0.000062 grad: 0.1427 (0.1456) loss: 0.7926 (0.7958) time: 0.2706 data: 0.0002 max mem: 26157 Train: [52] [3600/6250] eta: 0:12:54 lr: 0.000062 grad: 0.1228 (0.1453) loss: 0.7938 (0.7957) time: 0.2698 data: 0.0002 max mem: 26157 Train: [52] [3700/6250] eta: 0:12:24 lr: 0.000062 grad: 0.1238 (0.1450) loss: 0.7932 (0.7956) time: 0.2696 data: 0.0002 max mem: 26157 Train: [52] [3800/6250] eta: 0:11:53 lr: 0.000062 grad: 0.1201 (0.1447) loss: 0.7995 (0.7956) time: 0.2697 data: 0.0002 max mem: 26157 Train: [52] [3900/6250] eta: 0:11:23 lr: 0.000062 grad: 0.1438 (0.1445) loss: 0.7972 (0.7956) time: 0.2693 data: 0.0002 max mem: 26157 Train: [52] [4000/6250] eta: 0:10:52 lr: 0.000062 grad: 0.1258 (0.1442) loss: 0.7915 (0.7956) time: 0.2686 data: 0.0001 max mem: 26157 Train: [52] [4100/6250] eta: 0:10:22 lr: 0.000062 grad: 0.1256 (0.1440) loss: 0.7981 (0.7956) time: 0.2670 data: 0.0002 max mem: 26157 Train: [52] [4200/6250] eta: 0:09:52 lr: 0.000062 grad: 0.1297 (0.1438) loss: 0.7934 (0.7955) time: 0.2724 data: 0.0003 max mem: 26157 Train: [52] [4300/6250] eta: 0:09:23 lr: 0.000062 grad: 0.1276 (0.1435) loss: 0.7915 (0.7954) time: 0.2726 data: 0.0002 max mem: 26157 Train: [52] [4400/6250] eta: 0:08:53 lr: 0.000062 grad: 0.1296 (0.1433) loss: 0.7975 (0.7954) time: 0.2717 data: 0.0002 max mem: 26157 Train: [52] [4500/6250] eta: 0:08:23 lr: 0.000062 grad: 0.1299 (0.1430) loss: 0.7925 (0.7954) time: 0.2700 data: 0.0002 max mem: 26157 Train: [52] [4600/6250] eta: 0:07:54 lr: 0.000062 grad: 0.1246 (0.1429) loss: 0.7896 (0.7954) time: 0.2686 data: 0.0002 max mem: 26157 Train: [52] [4700/6250] eta: 0:07:25 lr: 0.000062 grad: 0.1325 (0.1430) loss: 0.7925 (0.7955) time: 0.2719 data: 0.0002 max mem: 26157 Train: [52] [4800/6250] eta: 0:06:55 lr: 0.000062 grad: 0.1266 (0.1432) loss: 0.7989 (0.7956) time: 0.2713 data: 0.0002 max mem: 26157 Train: [52] [4900/6250] eta: 0:06:26 lr: 0.000062 grad: 0.1373 (0.1433) loss: 0.7939 (0.7956) time: 0.2677 data: 0.0002 max mem: 26157 Train: [52] [5000/6250] eta: 0:05:57 lr: 0.000062 grad: 0.1350 (0.1435) loss: 0.7998 (0.7957) time: 0.2713 data: 0.0002 max mem: 26157 Train: [52] [5100/6250] eta: 0:05:28 lr: 0.000062 grad: 0.1234 (0.1434) loss: 0.7999 (0.7957) time: 0.2700 data: 0.0002 max mem: 26157 Train: [52] [5200/6250] eta: 0:05:00 lr: 0.000062 grad: 0.1328 (0.1434) loss: 0.7983 (0.7957) time: 0.2710 data: 0.0002 max mem: 26157 Train: [52] [5300/6250] eta: 0:04:33 lr: 0.000062 grad: 0.1340 (0.1433) loss: 0.7908 (0.7957) time: 0.2700 data: 0.0002 max mem: 26157 Train: [52] [5400/6250] eta: 0:04:04 lr: 0.000062 grad: 0.1201 (0.1432) loss: 0.8034 (0.7958) time: 0.2716 data: 0.0001 max mem: 26157 Train: [52] [5500/6250] eta: 0:03:35 lr: 0.000062 grad: 0.1303 (0.1431) loss: 0.7978 (0.7958) time: 0.2724 data: 0.0002 max mem: 26157 Train: [52] [5600/6250] eta: 0:03:07 lr: 0.000062 grad: 0.1293 (0.1430) loss: 0.7964 (0.7958) time: 0.2715 data: 0.0002 max mem: 26157 Train: [52] [5700/6250] eta: 0:02:38 lr: 0.000062 grad: 0.1200 (0.1429) loss: 0.8036 (0.7958) time: 0.2720 data: 0.0002 max mem: 26157 Train: [52] [5800/6250] eta: 0:02:09 lr: 0.000062 grad: 0.1299 (0.1428) loss: 0.7980 (0.7959) time: 0.2722 data: 0.0002 max mem: 26157 Train: [52] [5900/6250] eta: 0:01:40 lr: 0.000062 grad: 0.1320 (0.1428) loss: 0.7976 (0.7959) time: 0.2711 data: 0.0002 max mem: 26157 Train: [52] [6000/6250] eta: 0:01:11 lr: 0.000062 grad: 0.1291 (0.1426) loss: 0.8009 (0.7960) time: 0.2711 data: 0.0002 max mem: 26157 Train: [52] [6100/6250] eta: 0:00:43 lr: 0.000062 grad: 0.1296 (0.1424) loss: 0.8008 (0.7961) time: 0.2757 data: 0.0002 max mem: 26157 Train: [52] [6200/6250] eta: 0:00:14 lr: 0.000061 grad: 0.1301 (0.1424) loss: 0.8003 (0.7961) time: 0.4998 data: 0.2279 max mem: 26157 Train: [52] [6249/6250] eta: 0:00:00 lr: 0.000061 grad: 0.1298 (0.1425) loss: 0.8001 (0.7961) time: 0.2719 data: 0.0002 max mem: 26157 Train: [52] Total time: 0:30:06 (0.2891 s / it) Averaged stats: lr: 0.000061 grad: 0.1298 (0.1425) loss: 0.8001 (0.7961) Eval (hcp-train-subset): [52] [ 0/62] eta: 0:05:08 loss: 0.8337 (0.8337) time: 4.9682 data: 4.8847 max mem: 26157 Eval (hcp-train-subset): [52] [61/62] eta: 0:00:00 loss: 0.8184 (0.8204) time: 0.1376 data: 0.0546 max mem: 26157 Eval (hcp-train-subset): [52] Total time: 0:00:13 (0.2163 s / it) Averaged stats (hcp-train-subset): loss: 0.8184 (0.8204) Making plots (hcp-train-subset): example=38 Eval (hcp-val): [52] [ 0/62] eta: 0:03:16 loss: 0.8197 (0.8197) time: 3.1741 data: 3.0883 max mem: 26157 Eval (hcp-val): [52] [61/62] eta: 0:00:00 loss: 0.8242 (0.8258) time: 0.1333 data: 0.0507 max mem: 26157 Eval (hcp-val): [52] Total time: 0:00:13 (0.2103 s / it) Averaged stats (hcp-val): loss: 0.8242 (0.8258) Making plots (hcp-val): example=39 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [53] [ 0/6250] eta: 11:49:09 lr: 0.000061 grad: 0.1262 (0.1262) loss: 0.8527 (0.8527) time: 6.8080 data: 6.5301 max mem: 26157 Train: [53] [ 100/6250] eta: 0:35:08 lr: 0.000061 grad: 0.1294 (0.1828) loss: 0.8045 (0.8172) time: 0.2737 data: 0.0002 max mem: 26157 Train: [53] [ 200/6250] eta: 0:30:58 lr: 0.000061 grad: 0.1303 (0.1684) loss: 0.8061 (0.8089) time: 0.2730 data: 0.0002 max mem: 26157 Train: [53] [ 300/6250] eta: 0:29:16 lr: 0.000061 grad: 0.1247 (0.1574) loss: 0.7971 (0.8069) time: 0.2703 data: 0.0002 max mem: 26157 Train: [53] [ 400/6250] eta: 0:28:10 lr: 0.000061 grad: 0.1219 (0.1542) loss: 0.8043 (0.8060) time: 0.2708 data: 0.0002 max mem: 26157 Train: [53] [ 500/6250] eta: 0:27:19 lr: 0.000061 grad: 0.1311 (0.1505) loss: 0.8041 (0.8054) time: 0.2682 data: 0.0002 max mem: 26157 Train: [53] [ 600/6250] eta: 0:26:39 lr: 0.000061 grad: 0.1370 (0.1504) loss: 0.7966 (0.8044) time: 0.2777 data: 0.0002 max mem: 26157 Train: [53] [ 700/6250] eta: 0:27:16 lr: 0.000061 grad: 0.1387 (0.1503) loss: 0.7931 (0.8035) time: 0.6934 data: 0.4149 max mem: 26157 Train: [53] [ 800/6250] eta: 0:26:31 lr: 0.000061 grad: 0.1337 (0.1514) loss: 0.7862 (0.8025) time: 0.2708 data: 0.0002 max mem: 26157 Train: [53] [ 900/6250] eta: 0:26:10 lr: 0.000061 grad: 0.1211 (0.1503) loss: 0.7952 (0.8022) time: 0.4300 data: 0.1563 max mem: 26157 Train: [53] [1000/6250] eta: 0:25:29 lr: 0.000061 grad: 0.1370 (0.1502) loss: 0.7964 (0.8013) time: 0.2698 data: 0.0002 max mem: 26157 Train: [53] [1100/6250] eta: 0:25:05 lr: 0.000061 grad: 0.1393 (0.1489) loss: 0.7824 (0.8003) time: 0.4261 data: 0.1467 max mem: 26157 Train: [53] [1200/6250] eta: 0:25:53 lr: 0.000061 grad: 0.1296 (0.1483) loss: 0.7930 (0.7996) time: 0.2705 data: 0.0002 max mem: 26157 Train: [53] [1300/6250] eta: 0:25:08 lr: 0.000061 grad: 0.1355 (0.1485) loss: 0.7881 (0.7990) time: 0.2695 data: 0.0002 max mem: 26157 Train: [53] [1400/6250] eta: 0:24:25 lr: 0.000061 grad: 0.1328 (0.1484) loss: 0.7945 (0.7984) time: 0.2707 data: 0.0002 max mem: 26157 Train: [53] [1500/6250] eta: 0:23:45 lr: 0.000061 grad: 0.1373 (0.1500) loss: 0.7847 (0.7978) time: 0.2715 data: 0.0002 max mem: 26157 Train: [53] [1600/6250] eta: 0:23:07 lr: 0.000061 grad: 0.1290 (0.1500) loss: 0.7895 (0.7973) time: 0.2714 data: 0.0002 max mem: 26157 Train: [53] [1700/6250] eta: 0:22:30 lr: 0.000061 grad: 0.1268 (0.1499) loss: 0.7887 (0.7969) time: 0.2686 data: 0.0002 max mem: 26157 Train: [53] [1800/6250] eta: 0:21:53 lr: 0.000061 grad: 0.1311 (0.1502) loss: 0.7924 (0.7965) time: 0.2699 data: 0.0002 max mem: 26157 Train: [53] [1900/6250] eta: 0:21:18 lr: 0.000061 grad: 0.1414 (0.1507) loss: 0.7923 (0.7962) time: 0.2692 data: 0.0002 max mem: 26157 Train: [53] [2000/6250] eta: 0:20:44 lr: 0.000061 grad: 0.1319 (0.1509) loss: 0.7888 (0.7960) time: 0.2700 data: 0.0002 max mem: 26157 Train: [53] [2100/6250] eta: 0:20:10 lr: 0.000061 grad: 0.1371 (0.1510) loss: 0.7881 (0.7957) time: 0.2699 data: 0.0002 max mem: 26157 Train: [53] [2200/6250] eta: 0:19:37 lr: 0.000061 grad: 0.1246 (0.1508) loss: 0.7931 (0.7955) time: 0.2700 data: 0.0002 max mem: 26157 Train: [53] [2300/6250] eta: 0:19:04 lr: 0.000061 grad: 0.1352 (0.1506) loss: 0.7896 (0.7954) time: 0.2701 data: 0.0002 max mem: 26157 Train: [53] [2400/6250] eta: 0:18:33 lr: 0.000061 grad: 0.1274 (0.1508) loss: 0.7928 (0.7952) time: 0.2702 data: 0.0002 max mem: 26157 Train: [53] [2500/6250] eta: 0:18:01 lr: 0.000061 grad: 0.1285 (0.1509) loss: 0.7919 (0.7949) time: 0.2715 data: 0.0002 max mem: 26157 Train: [53] [2600/6250] eta: 0:17:41 lr: 0.000061 grad: 0.1252 (0.1508) loss: 0.7926 (0.7947) time: 0.2710 data: 0.0002 max mem: 26157 Train: [53] [2700/6250] eta: 0:17:09 lr: 0.000061 grad: 0.1321 (0.1506) loss: 0.7969 (0.7947) time: 0.2694 data: 0.0002 max mem: 26157 Train: [53] [2800/6250] eta: 0:16:39 lr: 0.000061 grad: 0.1283 (0.1506) loss: 0.7974 (0.7946) time: 0.2686 data: 0.0002 max mem: 26157 Train: [53] [2900/6250] eta: 0:16:08 lr: 0.000061 grad: 0.1389 (0.1508) loss: 0.7898 (0.7945) time: 0.2696 data: 0.0002 max mem: 26157 Train: [53] [3000/6250] eta: 0:15:38 lr: 0.000060 grad: 0.1222 (0.1505) loss: 0.7958 (0.7944) time: 0.2771 data: 0.0002 max mem: 26157 Train: [53] [3100/6250] eta: 0:15:15 lr: 0.000060 grad: 0.1278 (0.1507) loss: 0.7905 (0.7943) time: 0.2759 data: 0.0004 max mem: 26157 Train: [53] [3200/6250] eta: 0:14:44 lr: 0.000060 grad: 0.1258 (0.1504) loss: 0.7983 (0.7943) time: 0.2711 data: 0.0002 max mem: 26157 Train: [53] [3300/6250] eta: 0:14:21 lr: 0.000060 grad: 0.1317 (0.1505) loss: 0.7956 (0.7943) time: 0.2686 data: 0.0002 max mem: 26157 Train: [53] [3400/6250] eta: 0:13:50 lr: 0.000060 grad: 0.1301 (0.1501) loss: 0.7985 (0.7944) time: 0.2769 data: 0.0002 max mem: 26157 Train: [53] [3500/6250] eta: 0:13:19 lr: 0.000060 grad: 0.1409 (0.1503) loss: 0.7930 (0.7945) time: 0.2699 data: 0.0002 max mem: 26157 Train: [53] [3600/6250] eta: 0:13:04 lr: 0.000060 grad: 0.1372 (0.1502) loss: 0.7943 (0.7945) time: 0.2737 data: 0.0002 max mem: 26157 Train: [53] [3700/6250] eta: 0:12:33 lr: 0.000060 grad: 0.1172 (0.1501) loss: 0.7946 (0.7945) time: 0.2792 data: 0.0002 max mem: 26157 Train: [53] [3800/6250] eta: 0:12:07 lr: 0.000060 grad: 0.1436 (0.1501) loss: 0.7897 (0.7945) time: 0.5775 data: 0.3019 max mem: 26157 Train: [53] [3900/6250] eta: 0:11:36 lr: 0.000060 grad: 0.1369 (0.1502) loss: 0.7929 (0.7945) time: 0.2739 data: 0.0002 max mem: 26157 Train: [53] [4000/6250] eta: 0:11:05 lr: 0.000060 grad: 0.1255 (0.1503) loss: 0.7942 (0.7944) time: 0.2693 data: 0.0003 max mem: 26157 Train: [53] [4100/6250] eta: 0:10:34 lr: 0.000060 grad: 0.1330 (0.1501) loss: 0.7895 (0.7944) time: 0.2671 data: 0.0002 max mem: 26157 Train: [53] [4200/6250] eta: 0:10:03 lr: 0.000060 grad: 0.1275 (0.1502) loss: 0.7987 (0.7944) time: 0.2683 data: 0.0002 max mem: 26157 Train: [53] [4300/6250] eta: 0:09:33 lr: 0.000060 grad: 0.1311 (0.1499) loss: 0.7934 (0.7944) time: 0.2712 data: 0.0002 max mem: 26157 Train: [53] [4400/6250] eta: 0:09:02 lr: 0.000060 grad: 0.1500 (0.1500) loss: 0.7970 (0.7944) time: 0.2677 data: 0.0002 max mem: 26157 Train: [53] [4500/6250] eta: 0:08:32 lr: 0.000060 grad: 0.1305 (0.1499) loss: 0.7900 (0.7944) time: 0.2695 data: 0.0002 max mem: 26157 Train: [53] [4600/6250] eta: 0:08:02 lr: 0.000060 grad: 0.1295 (0.1498) loss: 0.7891 (0.7944) time: 0.2701 data: 0.0002 max mem: 26157 Train: [53] [4700/6250] eta: 0:07:32 lr: 0.000060 grad: 0.1366 (0.1498) loss: 0.7919 (0.7943) time: 0.2703 data: 0.0002 max mem: 26157 Train: [53] [4800/6250] eta: 0:07:02 lr: 0.000060 grad: 0.1398 (0.1498) loss: 0.7936 (0.7943) time: 0.2690 data: 0.0002 max mem: 26157 Train: [53] [4900/6250] eta: 0:06:32 lr: 0.000060 grad: 0.1389 (0.1499) loss: 0.7840 (0.7942) time: 0.2682 data: 0.0002 max mem: 26157 Train: [53] [5000/6250] eta: 0:06:03 lr: 0.000060 grad: 0.1424 (0.1505) loss: 0.7833 (0.7940) time: 0.2700 data: 0.0002 max mem: 26157 Train: [53] [5100/6250] eta: 0:05:33 lr: 0.000060 grad: 0.1348 (0.1505) loss: 0.7908 (0.7939) time: 0.2686 data: 0.0002 max mem: 26157 Train: [53] [5200/6250] eta: 0:05:04 lr: 0.000060 grad: 0.1313 (0.1504) loss: 0.7886 (0.7938) time: 0.2699 data: 0.0002 max mem: 26157 Train: [53] [5300/6250] eta: 0:04:34 lr: 0.000060 grad: 0.1443 (0.1505) loss: 0.7820 (0.7937) time: 0.2680 data: 0.0002 max mem: 26157 Train: [53] [5400/6250] eta: 0:04:05 lr: 0.000060 grad: 0.1254 (0.1504) loss: 0.7866 (0.7937) time: 0.2711 data: 0.0002 max mem: 26157 Train: [53] [5500/6250] eta: 0:03:36 lr: 0.000060 grad: 0.1247 (0.1503) loss: 0.7902 (0.7937) time: 0.4412 data: 0.1704 max mem: 26157 Train: [53] [5600/6250] eta: 0:03:07 lr: 0.000060 grad: 0.1398 (0.1504) loss: 0.7963 (0.7937) time: 0.2680 data: 0.0001 max mem: 26157 Train: [53] [5700/6250] eta: 0:02:38 lr: 0.000060 grad: 0.1349 (0.1505) loss: 0.7949 (0.7936) time: 0.2707 data: 0.0002 max mem: 26157 Train: [53] [5800/6250] eta: 0:02:09 lr: 0.000060 grad: 0.1348 (0.1505) loss: 0.7887 (0.7936) time: 0.2714 data: 0.0002 max mem: 26157 Train: [53] [5900/6250] eta: 0:01:40 lr: 0.000060 grad: 0.1322 (0.1504) loss: 0.7933 (0.7935) time: 0.2700 data: 0.0002 max mem: 26157 Train: [53] [6000/6250] eta: 0:01:11 lr: 0.000059 grad: 0.1343 (0.1505) loss: 0.7907 (0.7935) time: 0.2700 data: 0.0002 max mem: 26157 Train: [53] [6100/6250] eta: 0:00:43 lr: 0.000059 grad: 0.1311 (0.1506) loss: 0.7884 (0.7935) time: 0.2726 data: 0.0002 max mem: 26157 Train: [53] [6200/6250] eta: 0:00:14 lr: 0.000059 grad: 0.1322 (0.1505) loss: 0.7961 (0.7934) time: 0.2680 data: 0.0002 max mem: 26157 Train: [53] [6249/6250] eta: 0:00:00 lr: 0.000059 grad: 0.1318 (0.1505) loss: 0.7901 (0.7934) time: 0.2694 data: 0.0002 max mem: 26157 Train: [53] Total time: 0:29:59 (0.2880 s / it) Averaged stats: lr: 0.000059 grad: 0.1318 (0.1505) loss: 0.7901 (0.7934) Eval (hcp-train-subset): [53] [ 0/62] eta: 0:06:02 loss: 0.8361 (0.8361) time: 5.8547 data: 5.7717 max mem: 26157 Eval (hcp-train-subset): [53] [61/62] eta: 0:00:00 loss: 0.8195 (0.8211) time: 0.1156 data: 0.0307 max mem: 26157 Eval (hcp-train-subset): [53] Total time: 0:00:13 (0.2115 s / it) Averaged stats (hcp-train-subset): loss: 0.8195 (0.8211) Making plots (hcp-train-subset): example=43 Eval (hcp-val): [53] [ 0/62] eta: 0:04:42 loss: 0.8225 (0.8225) time: 4.5593 data: 4.4638 max mem: 26157 Eval (hcp-val): [53] [61/62] eta: 0:00:00 loss: 0.8244 (0.8258) time: 0.1169 data: 0.0337 max mem: 26157 Eval (hcp-val): [53] Total time: 0:00:13 (0.2149 s / it) Averaged stats (hcp-val): loss: 0.8244 (0.8258) Making plots (hcp-val): example=23 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [54] [ 0/6250] eta: 9:51:52 lr: 0.000059 grad: 0.1924 (0.1924) loss: 0.8357 (0.8357) time: 5.6820 data: 5.4022 max mem: 26157 Train: [54] [ 100/6250] eta: 0:33:52 lr: 0.000059 grad: 0.1393 (0.1632) loss: 0.7875 (0.8005) time: 0.2705 data: 0.0002 max mem: 26157 Train: [54] [ 200/6250] eta: 0:30:23 lr: 0.000059 grad: 0.1247 (0.1525) loss: 0.8051 (0.7980) time: 0.2708 data: 0.0003 max mem: 26157 Train: [54] [ 300/6250] eta: 0:31:31 lr: 0.000059 grad: 0.1261 (0.1482) loss: 0.7985 (0.7984) time: 0.6551 data: 0.3400 max mem: 26157 Train: [54] [ 400/6250] eta: 0:29:58 lr: 0.000059 grad: 0.1352 (0.1473) loss: 0.8035 (0.7974) time: 0.2773 data: 0.0002 max mem: 26157 Train: [54] [ 500/6250] eta: 0:29:12 lr: 0.000059 grad: 0.1218 (0.1462) loss: 0.8012 (0.7975) time: 0.2708 data: 0.0002 max mem: 26157 Train: [54] [ 600/6250] eta: 0:28:21 lr: 0.000059 grad: 0.1395 (0.1474) loss: 0.8015 (0.7976) time: 0.2699 data: 0.0002 max mem: 26157 Train: [54] [ 700/6250] eta: 0:27:30 lr: 0.000059 grad: 0.1297 (0.1477) loss: 0.8025 (0.7973) time: 0.2705 data: 0.0002 max mem: 26157 Train: [54] [ 800/6250] eta: 0:27:43 lr: 0.000059 grad: 0.1278 (0.1471) loss: 0.7967 (0.7976) time: 0.4012 data: 0.1213 max mem: 26157 Train: [54] [ 900/6250] eta: 0:26:56 lr: 0.000059 grad: 0.1172 (0.1458) loss: 0.8045 (0.7975) time: 0.2677 data: 0.0002 max mem: 26157 Train: [54] [1000/6250] eta: 0:26:46 lr: 0.000059 grad: 0.1266 (0.1445) loss: 0.8006 (0.7975) time: 0.2694 data: 0.0002 max mem: 26157 Train: [54] [1100/6250] eta: 0:27:11 lr: 0.000059 grad: 0.1180 (0.1444) loss: 0.8023 (0.7977) time: 0.2684 data: 0.0002 max mem: 26157 Train: [54] [1200/6250] eta: 0:26:20 lr: 0.000059 grad: 0.1267 (0.1441) loss: 0.7990 (0.7977) time: 0.2701 data: 0.0002 max mem: 26157 Train: [54] [1300/6250] eta: 0:25:32 lr: 0.000059 grad: 0.1329 (0.1438) loss: 0.7943 (0.7977) time: 0.2694 data: 0.0002 max mem: 26157 Train: [54] [1400/6250] eta: 0:24:47 lr: 0.000059 grad: 0.1297 (0.1434) loss: 0.8002 (0.7978) time: 0.2694 data: 0.0002 max mem: 26157 Train: [54] [1500/6250] eta: 0:24:18 lr: 0.000059 grad: 0.1282 (0.1437) loss: 0.7994 (0.7978) time: 0.4743 data: 0.1976 max mem: 26157 Train: [54] [1600/6250] eta: 0:23:37 lr: 0.000059 grad: 0.1249 (0.1438) loss: 0.7940 (0.7976) time: 0.2697 data: 0.0002 max mem: 26157 Train: [54] [1700/6250] eta: 0:22:57 lr: 0.000059 grad: 0.1272 (0.1437) loss: 0.7978 (0.7974) time: 0.2680 data: 0.0002 max mem: 26157 Train: [54] [1800/6250] eta: 0:22:19 lr: 0.000059 grad: 0.1276 (0.1440) loss: 0.7967 (0.7973) time: 0.2692 data: 0.0002 max mem: 26157 Train: [54] [1900/6250] eta: 0:21:42 lr: 0.000059 grad: 0.1285 (0.1446) loss: 0.7954 (0.7971) time: 0.2709 data: 0.0002 max mem: 26157 Train: [54] [2000/6250] eta: 0:21:05 lr: 0.000059 grad: 0.1300 (0.1456) loss: 0.7902 (0.7969) time: 0.2689 data: 0.0002 max mem: 26157 Train: [54] [2100/6250] eta: 0:20:30 lr: 0.000059 grad: 0.1440 (0.1464) loss: 0.7901 (0.7969) time: 0.2689 data: 0.0002 max mem: 26157 Train: [54] [2200/6250] eta: 0:19:56 lr: 0.000059 grad: 0.1283 (0.1462) loss: 0.7936 (0.7967) time: 0.2741 data: 0.0003 max mem: 26157 Train: [54] [2300/6250] eta: 0:19:22 lr: 0.000059 grad: 0.1339 (0.1464) loss: 0.7904 (0.7966) time: 0.2688 data: 0.0002 max mem: 26157 Train: [54] [2400/6250] eta: 0:18:48 lr: 0.000059 grad: 0.1356 (0.1467) loss: 0.7906 (0.7965) time: 0.2700 data: 0.0002 max mem: 26157 Train: [54] [2500/6250] eta: 0:18:16 lr: 0.000059 grad: 0.1221 (0.1472) loss: 0.7928 (0.7963) time: 0.2680 data: 0.0002 max mem: 26157 Train: [54] [2600/6250] eta: 0:17:44 lr: 0.000059 grad: 0.1294 (0.1472) loss: 0.7925 (0.7961) time: 0.2725 data: 0.0002 max mem: 26157 Train: [54] [2700/6250] eta: 0:17:12 lr: 0.000059 grad: 0.1284 (0.1471) loss: 0.7951 (0.7960) time: 0.2687 data: 0.0002 max mem: 26157 Train: [54] [2800/6250] eta: 0:16:40 lr: 0.000058 grad: 0.1220 (0.1469) loss: 0.8005 (0.7959) time: 0.2704 data: 0.0002 max mem: 26157 Train: [54] [2900/6250] eta: 0:16:09 lr: 0.000058 grad: 0.1310 (0.1474) loss: 0.7951 (0.7959) time: 0.2690 data: 0.0003 max mem: 26157 Train: [54] [3000/6250] eta: 0:15:38 lr: 0.000058 grad: 0.1290 (0.1475) loss: 0.8004 (0.7959) time: 0.2705 data: 0.0002 max mem: 26157 Train: [54] [3100/6250] eta: 0:15:07 lr: 0.000058 grad: 0.1236 (0.1477) loss: 0.7932 (0.7959) time: 0.2689 data: 0.0002 max mem: 26157 Train: [54] [3200/6250] eta: 0:14:36 lr: 0.000058 grad: 0.1240 (0.1476) loss: 0.8000 (0.7959) time: 0.2698 data: 0.0002 max mem: 26157 Train: [54] [3300/6250] eta: 0:14:06 lr: 0.000058 grad: 0.1280 (0.1478) loss: 0.7971 (0.7958) time: 0.2722 data: 0.0002 max mem: 26157 Train: [54] [3400/6250] eta: 0:13:36 lr: 0.000058 grad: 0.1231 (0.1477) loss: 0.8043 (0.7959) time: 0.2712 data: 0.0002 max mem: 26157 Train: [54] [3500/6250] eta: 0:13:07 lr: 0.000058 grad: 0.1285 (0.1478) loss: 0.7919 (0.7959) time: 0.2768 data: 0.0002 max mem: 26157 Train: [54] [3600/6250] eta: 0:12:38 lr: 0.000058 grad: 0.1370 (0.1478) loss: 0.7946 (0.7959) time: 0.2695 data: 0.0002 max mem: 26157 Train: [54] [3700/6250] eta: 0:12:08 lr: 0.000058 grad: 0.1294 (0.1478) loss: 0.7963 (0.7959) time: 0.2715 data: 0.0002 max mem: 26157 Train: [54] [3800/6250] eta: 0:11:38 lr: 0.000058 grad: 0.1260 (0.1476) loss: 0.7920 (0.7959) time: 0.2724 data: 0.0002 max mem: 26157 Train: [54] [3900/6250] eta: 0:11:09 lr: 0.000058 grad: 0.1244 (0.1476) loss: 0.7999 (0.7959) time: 0.2728 data: 0.0002 max mem: 26157 Train: [54] [4000/6250] eta: 0:10:43 lr: 0.000058 grad: 0.1330 (0.1476) loss: 0.7979 (0.7960) time: 0.2696 data: 0.0002 max mem: 26157 Train: [54] [4100/6250] eta: 0:10:13 lr: 0.000058 grad: 0.1312 (0.1477) loss: 0.7886 (0.7960) time: 0.2686 data: 0.0002 max mem: 26157 Train: [54] [4200/6250] eta: 0:09:44 lr: 0.000058 grad: 0.1283 (0.1478) loss: 0.7955 (0.7960) time: 0.2763 data: 0.0002 max mem: 26157 Train: [54] [4300/6250] eta: 0:09:15 lr: 0.000058 grad: 0.1302 (0.1481) loss: 0.7940 (0.7960) time: 0.2687 data: 0.0002 max mem: 26157 Train: [54] [4400/6250] eta: 0:08:46 lr: 0.000058 grad: 0.1449 (0.1499) loss: 0.7968 (0.7960) time: 0.2713 data: 0.0002 max mem: 26157 Train: [54] [4500/6250] eta: 0:08:17 lr: 0.000058 grad: 0.1391 (0.1502) loss: 0.7927 (0.7959) time: 0.2704 data: 0.0002 max mem: 26157 Train: [54] [4600/6250] eta: 0:07:48 lr: 0.000058 grad: 0.1317 (0.1504) loss: 0.7992 (0.7959) time: 0.2694 data: 0.0002 max mem: 26157 Train: [54] [4700/6250] eta: 0:07:19 lr: 0.000058 grad: 0.1421 (0.1506) loss: 0.7908 (0.7959) time: 0.2733 data: 0.0002 max mem: 26157 Train: [54] [4800/6250] eta: 0:06:50 lr: 0.000058 grad: 0.1398 (0.1509) loss: 0.8047 (0.7960) time: 0.2725 data: 0.0002 max mem: 26157 Train: [54] [4900/6250] eta: 0:06:22 lr: 0.000058 grad: 0.1265 (0.1508) loss: 0.8013 (0.7960) time: 0.2698 data: 0.0002 max mem: 26157 Train: [54] [5000/6250] eta: 0:05:53 lr: 0.000058 grad: 0.1252 (0.1512) loss: 0.8000 (0.7960) time: 0.2683 data: 0.0002 max mem: 26157 Train: [54] [5100/6250] eta: 0:05:26 lr: 0.000058 grad: 0.1363 (0.1509) loss: 0.7946 (0.7960) time: 0.2730 data: 0.0002 max mem: 26157 Train: [54] [5200/6250] eta: 0:04:58 lr: 0.000058 grad: 0.1338 (0.1508) loss: 0.7945 (0.7960) time: 0.2709 data: 0.0002 max mem: 26157 Train: [54] [5300/6250] eta: 0:04:29 lr: 0.000058 grad: 0.1421 (0.1509) loss: 0.7985 (0.7959) time: 0.3238 data: 0.0458 max mem: 26157 Train: [54] [5400/6250] eta: 0:04:01 lr: 0.000058 grad: 0.1346 (0.1510) loss: 0.7954 (0.7959) time: 0.2691 data: 0.0002 max mem: 26157 Train: [54] [5500/6250] eta: 0:03:33 lr: 0.000058 grad: 0.1302 (0.1512) loss: 0.7919 (0.7959) time: 0.2712 data: 0.0001 max mem: 26157 Train: [54] [5600/6250] eta: 0:03:04 lr: 0.000058 grad: 0.1297 (0.1512) loss: 0.7943 (0.7959) time: 0.2708 data: 0.0002 max mem: 26157 Train: [54] [5700/6250] eta: 0:02:35 lr: 0.000058 grad: 0.1343 (0.1517) loss: 0.7987 (0.7959) time: 0.2694 data: 0.0002 max mem: 26157 Train: [54] [5800/6250] eta: 0:02:07 lr: 0.000057 grad: 0.1402 (0.1520) loss: 0.7994 (0.7959) time: 0.2694 data: 0.0002 max mem: 26157 Train: [54] [5900/6250] eta: 0:01:39 lr: 0.000057 grad: 0.1311 (0.1519) loss: 0.8052 (0.7960) time: 0.2690 data: 0.0002 max mem: 26157 Train: [54] [6000/6250] eta: 0:01:10 lr: 0.000057 grad: 0.1258 (0.1519) loss: 0.7966 (0.7960) time: 0.2691 data: 0.0001 max mem: 26157 Train: [54] [6100/6250] eta: 0:00:42 lr: 0.000057 grad: 0.1281 (0.1519) loss: 0.7971 (0.7960) time: 0.2686 data: 0.0002 max mem: 26157 Train: [54] [6200/6250] eta: 0:00:14 lr: 0.000057 grad: 0.1502 (0.1518) loss: 0.7876 (0.7960) time: 0.2702 data: 0.0002 max mem: 26157 Train: [54] [6249/6250] eta: 0:00:00 lr: 0.000057 grad: 0.1353 (0.1519) loss: 0.7933 (0.7960) time: 0.2690 data: 0.0002 max mem: 26157 Train: [54] Total time: 0:29:30 (0.2833 s / it) Averaged stats: lr: 0.000057 grad: 0.1353 (0.1519) loss: 0.7933 (0.7960) Eval (hcp-train-subset): [54] [ 0/62] eta: 0:05:18 loss: 0.8347 (0.8347) time: 5.1423 data: 5.0550 max mem: 26157 Eval (hcp-train-subset): [54] [61/62] eta: 0:00:00 loss: 0.8195 (0.8213) time: 0.1208 data: 0.0359 max mem: 26157 Eval (hcp-train-subset): [54] Total time: 0:00:12 (0.2062 s / it) Averaged stats (hcp-train-subset): loss: 0.8195 (0.8213) Making plots (hcp-train-subset): example=60 Eval (hcp-val): [54] [ 0/62] eta: 0:03:28 loss: 0.8194 (0.8194) time: 3.3557 data: 3.2471 max mem: 26157 Eval (hcp-val): [54] [61/62] eta: 0:00:00 loss: 0.8249 (0.8251) time: 0.1282 data: 0.0436 max mem: 26157 Eval (hcp-val): [54] Total time: 0:00:13 (0.2155 s / it) Averaged stats (hcp-val): loss: 0.8249 (0.8251) Making plots (hcp-val): example=59 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [55] [ 0/6250] eta: 8:40:06 lr: 0.000057 grad: 0.3015 (0.3015) loss: 0.8035 (0.8035) time: 4.9931 data: 4.6598 max mem: 26157 Train: [55] [ 100/6250] eta: 0:33:53 lr: 0.000057 grad: 0.1576 (0.2045) loss: 0.7975 (0.8085) time: 0.2710 data: 0.0002 max mem: 26157 Train: [55] [ 200/6250] eta: 0:30:18 lr: 0.000057 grad: 0.1415 (0.1822) loss: 0.7985 (0.8035) time: 0.2698 data: 0.0002 max mem: 26157 Train: [55] [ 300/6250] eta: 0:28:46 lr: 0.000057 grad: 0.1331 (0.1716) loss: 0.8109 (0.8030) time: 0.2698 data: 0.0002 max mem: 26157 Train: [55] [ 400/6250] eta: 0:27:48 lr: 0.000057 grad: 0.1340 (0.1644) loss: 0.7940 (0.8024) time: 0.2725 data: 0.0002 max mem: 26157 Train: [55] [ 500/6250] eta: 0:27:02 lr: 0.000057 grad: 0.1255 (0.1584) loss: 0.8121 (0.8019) time: 0.2693 data: 0.0002 max mem: 26157 Train: [55] [ 600/6250] eta: 0:26:21 lr: 0.000057 grad: 0.1277 (0.1567) loss: 0.7919 (0.8013) time: 0.2675 data: 0.0002 max mem: 26157 Train: [55] [ 700/6250] eta: 0:25:45 lr: 0.000057 grad: 0.1194 (0.1536) loss: 0.7989 (0.8012) time: 0.2707 data: 0.0002 max mem: 26157 Train: [55] [ 800/6250] eta: 0:25:12 lr: 0.000057 grad: 0.1334 (0.1526) loss: 0.8000 (0.8009) time: 0.2722 data: 0.0002 max mem: 26157 Train: [55] [ 900/6250] eta: 0:24:40 lr: 0.000057 grad: 0.1244 (0.1516) loss: 0.8006 (0.8008) time: 0.2704 data: 0.0003 max mem: 26157 Train: [55] [1000/6250] eta: 0:24:09 lr: 0.000057 grad: 0.1261 (0.1497) loss: 0.8018 (0.8005) time: 0.2692 data: 0.0002 max mem: 26157 Train: [55] [1100/6250] eta: 0:23:38 lr: 0.000057 grad: 0.1229 (0.1480) loss: 0.8016 (0.8005) time: 0.2724 data: 0.0003 max mem: 26157 Train: [55] [1200/6250] eta: 0:23:09 lr: 0.000057 grad: 0.1238 (0.1470) loss: 0.7958 (0.8004) time: 0.2734 data: 0.0002 max mem: 26157 Train: [55] [1300/6250] eta: 0:22:40 lr: 0.000057 grad: 0.1222 (0.1462) loss: 0.8026 (0.8004) time: 0.2726 data: 0.0002 max mem: 26157 Train: [55] [1400/6250] eta: 0:22:29 lr: 0.000057 grad: 0.1282 (0.1458) loss: 0.7960 (0.8002) time: 0.2742 data: 0.0002 max mem: 26157 Train: [55] [1500/6250] eta: 0:22:22 lr: 0.000057 grad: 0.1275 (0.1452) loss: 0.7892 (0.8000) time: 0.2713 data: 0.0002 max mem: 26157 Train: [55] [1600/6250] eta: 0:21:51 lr: 0.000057 grad: 0.1259 (0.1453) loss: 0.7904 (0.7998) time: 0.2789 data: 0.0003 max mem: 26157 Train: [55] [1700/6250] eta: 0:21:48 lr: 0.000057 grad: 0.1343 (0.1457) loss: 0.7944 (0.7995) time: 0.3086 data: 0.0341 max mem: 26157 Train: [55] [1800/6250] eta: 0:21:15 lr: 0.000057 grad: 0.1237 (0.1455) loss: 0.7985 (0.7992) time: 0.2692 data: 0.0002 max mem: 26157 Train: [55] [1900/6250] eta: 0:20:43 lr: 0.000057 grad: 0.1325 (0.1455) loss: 0.7900 (0.7989) time: 0.2711 data: 0.0002 max mem: 26157 Train: [55] [2000/6250] eta: 0:20:11 lr: 0.000057 grad: 0.1306 (0.1459) loss: 0.7905 (0.7985) time: 0.2774 data: 0.0002 max mem: 26157 Train: [55] [2100/6250] eta: 0:19:51 lr: 0.000057 grad: 0.1256 (0.1459) loss: 0.7901 (0.7982) time: 0.5129 data: 0.2352 max mem: 26157 Train: [55] [2200/6250] eta: 0:19:32 lr: 0.000057 grad: 0.1246 (0.1461) loss: 0.7886 (0.7980) time: 0.2715 data: 0.0002 max mem: 26157 Train: [55] [2300/6250] eta: 0:19:00 lr: 0.000057 grad: 0.1421 (0.1460) loss: 0.7865 (0.7976) time: 0.2705 data: 0.0002 max mem: 26157 Train: [55] [2400/6250] eta: 0:18:28 lr: 0.000057 grad: 0.1308 (0.1458) loss: 0.7925 (0.7975) time: 0.2694 data: 0.0002 max mem: 26157 Train: [55] [2500/6250] eta: 0:17:56 lr: 0.000057 grad: 0.1278 (0.1457) loss: 0.8004 (0.7975) time: 0.2687 data: 0.0002 max mem: 26157 Train: [55] [2600/6250] eta: 0:17:25 lr: 0.000056 grad: 0.1282 (0.1454) loss: 0.7980 (0.7975) time: 0.2680 data: 0.0001 max mem: 26157 Train: [55] [2700/6250] eta: 0:16:54 lr: 0.000056 grad: 0.1473 (0.1457) loss: 0.7939 (0.7974) time: 0.2692 data: 0.0002 max mem: 26157 Train: [55] [2800/6250] eta: 0:16:23 lr: 0.000056 grad: 0.2013 (0.1482) loss: 0.8000 (0.7974) time: 0.2694 data: 0.0002 max mem: 26157 Train: [55] [2900/6250] eta: 0:15:53 lr: 0.000056 grad: 0.1249 (0.1483) loss: 0.7941 (0.7972) time: 0.2718 data: 0.0002 max mem: 26157 Train: [55] [3000/6250] eta: 0:15:23 lr: 0.000056 grad: 0.1719 (0.1489) loss: 0.7927 (0.7971) time: 0.2689 data: 0.0002 max mem: 26157 Train: [55] [3100/6250] eta: 0:14:53 lr: 0.000056 grad: 0.1653 (0.1502) loss: 0.7806 (0.7968) time: 0.2709 data: 0.0002 max mem: 26157 Train: [55] [3200/6250] eta: 0:14:24 lr: 0.000056 grad: 0.1352 (0.1519) loss: 0.7928 (0.7966) time: 0.2696 data: 0.0002 max mem: 26157 Train: [55] [3300/6250] eta: 0:14:02 lr: 0.000056 grad: 0.1550 (0.1533) loss: 0.7925 (0.7964) time: 0.2714 data: 0.0002 max mem: 26157 Train: [55] [3400/6250] eta: 0:13:32 lr: 0.000056 grad: 0.1305 (0.1538) loss: 0.7913 (0.7963) time: 0.2702 data: 0.0002 max mem: 26157 Train: [55] [3500/6250] eta: 0:13:02 lr: 0.000056 grad: 0.1324 (0.1535) loss: 0.7939 (0.7962) time: 0.2683 data: 0.0002 max mem: 26157 Train: [55] [3600/6250] eta: 0:12:32 lr: 0.000056 grad: 0.1596 (0.1540) loss: 0.7879 (0.7960) time: 0.2693 data: 0.0002 max mem: 26157 Train: [55] [3700/6250] eta: 0:12:03 lr: 0.000056 grad: 0.1428 (0.1546) loss: 0.7959 (0.7958) time: 0.2687 data: 0.0002 max mem: 26157 Train: [55] [3800/6250] eta: 0:11:34 lr: 0.000056 grad: 0.1465 (0.1546) loss: 0.7878 (0.7957) time: 0.2682 data: 0.0002 max mem: 26157 Train: [55] [3900/6250] eta: 0:11:04 lr: 0.000056 grad: 0.1375 (0.1547) loss: 0.7970 (0.7957) time: 0.2687 data: 0.0002 max mem: 26157 Train: [55] [4000/6250] eta: 0:10:35 lr: 0.000056 grad: 0.1289 (0.1548) loss: 0.7956 (0.7955) time: 0.2706 data: 0.0002 max mem: 26157 Train: [55] [4100/6250] eta: 0:10:06 lr: 0.000056 grad: 0.1354 (0.1548) loss: 0.7949 (0.7955) time: 0.2717 data: 0.0002 max mem: 26157 Train: [55] [4200/6250] eta: 0:09:38 lr: 0.000056 grad: 0.1349 (0.1545) loss: 0.7962 (0.7954) time: 0.2698 data: 0.0002 max mem: 26157 Train: [55] [4300/6250] eta: 0:09:12 lr: 0.000056 grad: 0.1313 (0.1543) loss: 0.7910 (0.7954) time: 0.2705 data: 0.0002 max mem: 26157 Train: [55] [4400/6250] eta: 0:08:43 lr: 0.000056 grad: 0.1456 (0.1548) loss: 0.7890 (0.7953) time: 0.2738 data: 0.0002 max mem: 26157 Train: [55] [4500/6250] eta: 0:08:15 lr: 0.000056 grad: 0.1337 (0.1550) loss: 0.7956 (0.7952) time: 0.2755 data: 0.0002 max mem: 26157 Train: [55] [4600/6250] eta: 0:07:46 lr: 0.000056 grad: 0.1472 (0.1548) loss: 0.7942 (0.7951) time: 0.2682 data: 0.0001 max mem: 26157 Train: [55] [4700/6250] eta: 0:07:18 lr: 0.000056 grad: 0.1326 (0.1545) loss: 0.7902 (0.7950) time: 0.2681 data: 0.0002 max mem: 26157 Train: [55] [4800/6250] eta: 0:06:49 lr: 0.000056 grad: 0.1318 (0.1543) loss: 0.7929 (0.7950) time: 0.2700 data: 0.0002 max mem: 26157 Train: [55] [4900/6250] eta: 0:06:20 lr: 0.000056 grad: 0.1392 (0.1541) loss: 0.7843 (0.7949) time: 0.2726 data: 0.0002 max mem: 26157 Train: [55] [5000/6250] eta: 0:05:52 lr: 0.000056 grad: 0.1425 (0.1538) loss: 0.7799 (0.7949) time: 0.2705 data: 0.0002 max mem: 26157 Train: [55] [5100/6250] eta: 0:05:24 lr: 0.000056 grad: 0.1303 (0.1537) loss: 0.7939 (0.7948) time: 0.2734 data: 0.0002 max mem: 26157 Train: [55] [5200/6250] eta: 0:04:56 lr: 0.000056 grad: 0.1288 (0.1536) loss: 0.7941 (0.7948) time: 0.2734 data: 0.0002 max mem: 26157 Train: [55] [5300/6250] eta: 0:04:27 lr: 0.000056 grad: 0.1316 (0.1537) loss: 0.7960 (0.7948) time: 0.2731 data: 0.0002 max mem: 26157 Train: [55] [5400/6250] eta: 0:04:00 lr: 0.000056 grad: 0.1516 (0.1536) loss: 0.7909 (0.7947) time: 0.5075 data: 0.2238 max mem: 26157 Train: [55] [5500/6250] eta: 0:03:32 lr: 0.000056 grad: 0.1459 (0.1536) loss: 0.7999 (0.7947) time: 0.2761 data: 0.0002 max mem: 26157 Train: [55] [5600/6250] eta: 0:03:03 lr: 0.000055 grad: 0.1395 (0.1534) loss: 0.7986 (0.7947) time: 0.2704 data: 0.0002 max mem: 26157 Train: [55] [5700/6250] eta: 0:02:36 lr: 0.000055 grad: 0.1348 (0.1537) loss: 0.7966 (0.7947) time: 0.2724 data: 0.0002 max mem: 26157 Train: [55] [5800/6250] eta: 0:02:07 lr: 0.000055 grad: 0.1534 (0.1536) loss: 0.7955 (0.7947) time: 0.2709 data: 0.0002 max mem: 26157 Train: [55] [5900/6250] eta: 0:01:39 lr: 0.000055 grad: 0.1382 (0.1536) loss: 0.8025 (0.7948) time: 0.2685 data: 0.0002 max mem: 26157 Train: [55] [6000/6250] eta: 0:01:10 lr: 0.000055 grad: 0.1314 (0.1536) loss: 0.8019 (0.7948) time: 0.2686 data: 0.0001 max mem: 26157 Train: [55] [6100/6250] eta: 0:00:42 lr: 0.000055 grad: 0.1358 (0.1536) loss: 0.7974 (0.7949) time: 0.2688 data: 0.0002 max mem: 26157 Train: [55] [6200/6250] eta: 0:00:14 lr: 0.000055 grad: 0.1347 (0.1534) loss: 0.7933 (0.7950) time: 0.2684 data: 0.0002 max mem: 26157 Train: [55] [6249/6250] eta: 0:00:00 lr: 0.000055 grad: 0.1325 (0.1534) loss: 0.8038 (0.7950) time: 0.2723 data: 0.0002 max mem: 26157 Train: [55] Total time: 0:29:33 (0.2838 s / it) Averaged stats: lr: 0.000055 grad: 0.1325 (0.1534) loss: 0.8038 (0.7950) Eval (hcp-train-subset): [55] [ 0/62] eta: 0:05:32 loss: 0.8294 (0.8294) time: 5.3578 data: 5.2728 max mem: 26157 Eval (hcp-train-subset): [55] [61/62] eta: 0:00:00 loss: 0.8171 (0.8192) time: 0.1394 data: 0.0567 max mem: 26157 Eval (hcp-train-subset): [55] Total time: 0:00:13 (0.2114 s / it) Averaged stats (hcp-train-subset): loss: 0.8171 (0.8192) Making plots (hcp-train-subset): example=42 Eval (hcp-val): [55] [ 0/62] eta: 0:03:37 loss: 0.8190 (0.8190) time: 3.5055 data: 3.4009 max mem: 26157 Eval (hcp-val): [55] [61/62] eta: 0:00:00 loss: 0.8237 (0.8248) time: 0.1394 data: 0.0542 max mem: 26157 Eval (hcp-val): [55] Total time: 0:00:13 (0.2179 s / it) Averaged stats (hcp-val): loss: 0.8237 (0.8248) Making plots (hcp-val): example=54 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [56] [ 0/6250] eta: 9:42:35 lr: 0.000055 grad: 0.2590 (0.2590) loss: 0.8266 (0.8266) time: 5.5930 data: 5.2986 max mem: 26157 Train: [56] [ 100/6250] eta: 0:33:52 lr: 0.000055 grad: 0.1565 (0.1534) loss: 0.8120 (0.8250) time: 0.2690 data: 0.0002 max mem: 26157 Train: [56] [ 200/6250] eta: 0:30:18 lr: 0.000055 grad: 0.1613 (0.1529) loss: 0.7987 (0.8175) time: 0.2704 data: 0.0002 max mem: 26157 Train: [56] [ 300/6250] eta: 0:28:51 lr: 0.000055 grad: 0.1403 (0.1499) loss: 0.7991 (0.8137) time: 0.2713 data: 0.0002 max mem: 26157 Train: [56] [ 400/6250] eta: 0:27:52 lr: 0.000055 grad: 0.1394 (0.1511) loss: 0.8006 (0.8105) time: 0.2694 data: 0.0002 max mem: 26157 Train: [56] [ 500/6250] eta: 0:27:05 lr: 0.000055 grad: 0.1360 (0.1526) loss: 0.7947 (0.8078) time: 0.2683 data: 0.0002 max mem: 26157 Train: [56] [ 600/6250] eta: 0:26:25 lr: 0.000055 grad: 0.1390 (0.1519) loss: 0.7921 (0.8057) time: 0.2679 data: 0.0002 max mem: 26157 Train: [56] [ 700/6250] eta: 0:26:08 lr: 0.000055 grad: 0.1524 (0.1515) loss: 0.7985 (0.8040) time: 0.2696 data: 0.0002 max mem: 26157 Train: [56] [ 800/6250] eta: 0:25:32 lr: 0.000055 grad: 0.1206 (0.1508) loss: 0.7986 (0.8034) time: 0.2726 data: 0.0002 max mem: 26157 Train: [56] [ 900/6250] eta: 0:25:29 lr: 0.000055 grad: 0.1385 (0.1495) loss: 0.8009 (0.8029) time: 0.2692 data: 0.0002 max mem: 26157 Train: [56] [1000/6250] eta: 0:25:35 lr: 0.000055 grad: 0.1339 (0.1500) loss: 0.7940 (0.8024) time: 0.2744 data: 0.0002 max mem: 26157 Train: [56] [1100/6250] eta: 0:24:56 lr: 0.000055 grad: 0.1347 (0.1499) loss: 0.7900 (0.8019) time: 0.2687 data: 0.0002 max mem: 26157 Train: [56] [1200/6250] eta: 0:24:18 lr: 0.000055 grad: 0.1271 (0.1492) loss: 0.7954 (0.8016) time: 0.2687 data: 0.0002 max mem: 26157 Train: [56] [1300/6250] eta: 0:23:41 lr: 0.000055 grad: 0.1313 (0.1490) loss: 0.7967 (0.8013) time: 0.2687 data: 0.0002 max mem: 26157 Train: [56] [1400/6250] eta: 0:23:06 lr: 0.000055 grad: 0.1374 (0.1486) loss: 0.7905 (0.8008) time: 0.2682 data: 0.0002 max mem: 26157 Train: [56] [1500/6250] eta: 0:22:32 lr: 0.000055 grad: 0.1338 (0.1486) loss: 0.7902 (0.8004) time: 0.2713 data: 0.0002 max mem: 26157 Train: [56] [1600/6250] eta: 0:22:00 lr: 0.000055 grad: 0.1244 (0.1481) loss: 0.7953 (0.8001) time: 0.2710 data: 0.0002 max mem: 26157 Train: [56] [1700/6250] eta: 0:21:28 lr: 0.000055 grad: 0.1299 (0.1491) loss: 0.7938 (0.7998) time: 0.2697 data: 0.0002 max mem: 26157 Train: [56] [1800/6250] eta: 0:20:56 lr: 0.000055 grad: 0.1261 (0.1496) loss: 0.7951 (0.7994) time: 0.2698 data: 0.0002 max mem: 26157 Train: [56] [1900/6250] eta: 0:20:25 lr: 0.000055 grad: 0.1386 (0.1495) loss: 0.7874 (0.7991) time: 0.2703 data: 0.0002 max mem: 26157 Train: [56] [2000/6250] eta: 0:19:54 lr: 0.000055 grad: 0.1273 (0.1490) loss: 0.7934 (0.7988) time: 0.2710 data: 0.0002 max mem: 26157 Train: [56] [2100/6250] eta: 0:19:26 lr: 0.000055 grad: 0.1318 (0.1491) loss: 0.7958 (0.7985) time: 0.2697 data: 0.0002 max mem: 26157 Train: [56] [2200/6250] eta: 0:18:56 lr: 0.000055 grad: 0.1371 (0.1497) loss: 0.7878 (0.7982) time: 0.2688 data: 0.0002 max mem: 26157 Train: [56] [2300/6250] eta: 0:18:26 lr: 0.000055 grad: 0.1363 (0.1491) loss: 0.7952 (0.7980) time: 0.2691 data: 0.0002 max mem: 26157 Train: [56] [2400/6250] eta: 0:17:56 lr: 0.000054 grad: 0.1330 (0.1490) loss: 0.7948 (0.7977) time: 0.2700 data: 0.0002 max mem: 26157 Train: [56] [2500/6250] eta: 0:17:26 lr: 0.000054 grad: 0.1522 (0.1495) loss: 0.7981 (0.7974) time: 0.2684 data: 0.0002 max mem: 26157 Train: [56] [2600/6250] eta: 0:16:57 lr: 0.000054 grad: 0.1342 (0.1495) loss: 0.7927 (0.7972) time: 0.2706 data: 0.0002 max mem: 26157 Train: [56] [2700/6250] eta: 0:16:28 lr: 0.000054 grad: 0.1440 (0.1499) loss: 0.7985 (0.7971) time: 0.2693 data: 0.0002 max mem: 26157 Train: [56] [2800/6250] eta: 0:15:59 lr: 0.000054 grad: 0.1317 (0.1500) loss: 0.8008 (0.7970) time: 0.2705 data: 0.0002 max mem: 26157 Train: [56] [2900/6250] eta: 0:15:30 lr: 0.000054 grad: 0.1376 (0.1502) loss: 0.7911 (0.7969) time: 0.2681 data: 0.0002 max mem: 26157 Train: [56] [3000/6250] eta: 0:15:02 lr: 0.000054 grad: 0.1486 (0.1505) loss: 0.7920 (0.7967) time: 0.2687 data: 0.0002 max mem: 26157 Train: [56] [3100/6250] eta: 0:14:33 lr: 0.000054 grad: 0.1469 (0.1506) loss: 0.7884 (0.7967) time: 0.2712 data: 0.0002 max mem: 26157 Train: [56] [3200/6250] eta: 0:14:07 lr: 0.000054 grad: 0.1285 (0.1503) loss: 0.7969 (0.7965) time: 0.2692 data: 0.0002 max mem: 26157 Train: [56] [3300/6250] eta: 0:13:40 lr: 0.000054 grad: 0.1353 (0.1500) loss: 0.7914 (0.7964) time: 0.2690 data: 0.0002 max mem: 26157 Train: [56] [3400/6250] eta: 0:13:15 lr: 0.000054 grad: 0.1362 (0.1498) loss: 0.7941 (0.7963) time: 0.4716 data: 0.1993 max mem: 26157 Train: [56] [3500/6250] eta: 0:12:46 lr: 0.000054 grad: 0.1323 (0.1502) loss: 0.7985 (0.7962) time: 0.2680 data: 0.0002 max mem: 26157 Train: [56] [3600/6250] eta: 0:12:26 lr: 0.000054 grad: 0.1464 (0.1503) loss: 0.7906 (0.7961) time: 0.4573 data: 0.1788 max mem: 26157 Train: [56] [3700/6250] eta: 0:11:57 lr: 0.000054 grad: 0.1355 (0.1508) loss: 0.7914 (0.7959) time: 0.2846 data: 0.0003 max mem: 26157 Train: [56] [3800/6250] eta: 0:11:28 lr: 0.000054 grad: 0.1483 (0.1510) loss: 0.7904 (0.7958) time: 0.2773 data: 0.0003 max mem: 26157 Train: [56] [3900/6250] eta: 0:11:00 lr: 0.000054 grad: 0.1356 (0.1511) loss: 0.7934 (0.7957) time: 0.2703 data: 0.0002 max mem: 26157 Train: [56] [4000/6250] eta: 0:10:32 lr: 0.000054 grad: 0.1343 (0.1510) loss: 0.7946 (0.7956) time: 0.2699 data: 0.0002 max mem: 26157 Train: [56] [4100/6250] eta: 0:10:03 lr: 0.000054 grad: 0.1378 (0.1513) loss: 0.7852 (0.7955) time: 0.2702 data: 0.0002 max mem: 26157 Train: [56] [4200/6250] eta: 0:09:35 lr: 0.000054 grad: 0.1472 (0.1516) loss: 0.7913 (0.7954) time: 0.2702 data: 0.0002 max mem: 26157 Train: [56] [4300/6250] eta: 0:09:06 lr: 0.000054 grad: 0.1349 (0.1517) loss: 0.7915 (0.7952) time: 0.2695 data: 0.0002 max mem: 26157 Train: [56] [4400/6250] eta: 0:08:38 lr: 0.000054 grad: 0.1307 (0.1517) loss: 0.7880 (0.7951) time: 0.2704 data: 0.0002 max mem: 26157 Train: [56] [4500/6250] eta: 0:08:09 lr: 0.000054 grad: 0.1296 (0.1518) loss: 0.7904 (0.7949) time: 0.2697 data: 0.0002 max mem: 26157 Train: [56] [4600/6250] eta: 0:07:41 lr: 0.000054 grad: 0.1344 (0.1517) loss: 0.7857 (0.7948) time: 0.2714 data: 0.0002 max mem: 26157 Train: [56] [4700/6250] eta: 0:07:13 lr: 0.000054 grad: 0.1387 (0.1519) loss: 0.7854 (0.7945) time: 0.2704 data: 0.0002 max mem: 26157 Train: [56] [4800/6250] eta: 0:06:45 lr: 0.000054 grad: 0.1413 (0.1520) loss: 0.7846 (0.7944) time: 0.2699 data: 0.0002 max mem: 26157 Train: [56] [4900/6250] eta: 0:06:17 lr: 0.000054 grad: 0.1454 (0.1520) loss: 0.7856 (0.7943) time: 0.2722 data: 0.0002 max mem: 26157 Train: [56] [5000/6250] eta: 0:05:50 lr: 0.000054 grad: 0.1340 (0.1520) loss: 0.7911 (0.7942) time: 0.2674 data: 0.0002 max mem: 26157 Train: [56] [5100/6250] eta: 0:05:21 lr: 0.000054 grad: 0.1382 (0.1521) loss: 0.7794 (0.7940) time: 0.2708 data: 0.0002 max mem: 26157 Train: [56] [5200/6250] eta: 0:04:53 lr: 0.000054 grad: 0.1340 (0.1522) loss: 0.7881 (0.7939) time: 0.2735 data: 0.0002 max mem: 26157 Train: [56] [5300/6250] eta: 0:04:25 lr: 0.000054 grad: 0.1370 (0.1525) loss: 0.7882 (0.7938) time: 0.2717 data: 0.0002 max mem: 26157 Train: [56] [5400/6250] eta: 0:03:57 lr: 0.000054 grad: 0.1406 (0.1525) loss: 0.7846 (0.7937) time: 0.2717 data: 0.0002 max mem: 26157 Train: [56] [5500/6250] eta: 0:03:29 lr: 0.000053 grad: 0.1379 (0.1526) loss: 0.7938 (0.7937) time: 0.2679 data: 0.0002 max mem: 26157 Train: [56] [5600/6250] eta: 0:03:01 lr: 0.000053 grad: 0.1380 (0.1530) loss: 0.7878 (0.7936) time: 0.2727 data: 0.0002 max mem: 26157 Train: [56] [5700/6250] eta: 0:02:33 lr: 0.000053 grad: 0.1382 (0.1528) loss: 0.7864 (0.7936) time: 0.2708 data: 0.0002 max mem: 26157 Train: [56] [5800/6250] eta: 0:02:05 lr: 0.000053 grad: 0.1314 (0.1528) loss: 0.7889 (0.7935) time: 0.2709 data: 0.0002 max mem: 26157 Train: [56] [5900/6250] eta: 0:01:37 lr: 0.000053 grad: 0.1426 (0.1530) loss: 0.7916 (0.7935) time: 0.2702 data: 0.0002 max mem: 26157 Train: [56] [6000/6250] eta: 0:01:09 lr: 0.000053 grad: 0.1313 (0.1530) loss: 0.7922 (0.7935) time: 0.2709 data: 0.0002 max mem: 26157 Train: [56] [6100/6250] eta: 0:00:41 lr: 0.000053 grad: 0.1247 (0.1530) loss: 0.7917 (0.7935) time: 0.2709 data: 0.0002 max mem: 26157 Train: [56] [6200/6250] eta: 0:00:13 lr: 0.000053 grad: 0.1482 (0.1532) loss: 0.7875 (0.7934) time: 0.2721 data: 0.0002 max mem: 26157 Train: [56] [6249/6250] eta: 0:00:00 lr: 0.000053 grad: 0.1355 (0.1532) loss: 0.7942 (0.7934) time: 0.2773 data: 0.0002 max mem: 26157 Train: [56] Total time: 0:29:05 (0.2793 s / it) Averaged stats: lr: 0.000053 grad: 0.1355 (0.1532) loss: 0.7942 (0.7934) Eval (hcp-train-subset): [56] [ 0/62] eta: 0:04:16 loss: 0.8312 (0.8312) time: 4.1324 data: 4.0084 max mem: 26157 Eval (hcp-train-subset): [56] [61/62] eta: 0:00:00 loss: 0.8176 (0.8183) time: 0.1345 data: 0.0499 max mem: 26157 Eval (hcp-train-subset): [56] Total time: 0:00:13 (0.2242 s / it) Averaged stats (hcp-train-subset): loss: 0.8176 (0.8183) Making plots (hcp-train-subset): example=45 Eval (hcp-val): [56] [ 0/62] eta: 0:03:48 loss: 0.8197 (0.8197) time: 3.6789 data: 3.5567 max mem: 26157 Eval (hcp-val): [56] [61/62] eta: 0:00:00 loss: 0.8239 (0.8246) time: 0.1460 data: 0.0630 max mem: 26157 Eval (hcp-val): [56] Total time: 0:00:13 (0.2137 s / it) Averaged stats (hcp-val): loss: 0.8239 (0.8246) Making plots (hcp-val): example=16 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [57] [ 0/6250] eta: 10:33:25 lr: 0.000053 grad: 0.3559 (0.3559) loss: 0.7869 (0.7869) time: 6.0809 data: 5.8051 max mem: 26157 Train: [57] [ 100/6250] eta: 0:34:10 lr: 0.000053 grad: 0.1429 (0.1735) loss: 0.8076 (0.8157) time: 0.2703 data: 0.0002 max mem: 26157 Train: [57] [ 200/6250] eta: 0:30:27 lr: 0.000053 grad: 0.1447 (0.1651) loss: 0.8013 (0.8094) time: 0.2710 data: 0.0002 max mem: 26157 Train: [57] [ 300/6250] eta: 0:28:56 lr: 0.000053 grad: 0.1324 (0.1586) loss: 0.8039 (0.8084) time: 0.2703 data: 0.0002 max mem: 26157 Train: [57] [ 400/6250] eta: 0:27:56 lr: 0.000053 grad: 0.1404 (0.1558) loss: 0.7973 (0.8073) time: 0.2691 data: 0.0002 max mem: 26157 Train: [57] [ 500/6250] eta: 0:27:13 lr: 0.000053 grad: 0.1292 (0.1553) loss: 0.7994 (0.8067) time: 0.2780 data: 0.0003 max mem: 26157 Train: [57] [ 600/6250] eta: 0:26:35 lr: 0.000053 grad: 0.1511 (0.1563) loss: 0.7950 (0.8058) time: 0.2712 data: 0.0003 max mem: 26157 Train: [57] [ 700/6250] eta: 0:27:19 lr: 0.000053 grad: 0.1593 (0.1639) loss: 0.8017 (0.8050) time: 0.2772 data: 0.0002 max mem: 26157 Train: [57] [ 800/6250] eta: 0:26:36 lr: 0.000053 grad: 0.1675 (0.1735) loss: 0.7973 (0.8043) time: 0.2772 data: 0.0003 max mem: 26157 Train: [57] [ 900/6250] eta: 0:26:50 lr: 0.000053 grad: 0.1388 (0.1751) loss: 0.8040 (0.8038) time: 0.7217 data: 0.4471 max mem: 26157 Train: [57] [1000/6250] eta: 0:26:05 lr: 0.000053 grad: 0.1232 (0.1716) loss: 0.7998 (0.8032) time: 0.2755 data: 0.0002 max mem: 26157 Train: [57] [1100/6250] eta: 0:25:22 lr: 0.000053 grad: 0.1305 (0.1688) loss: 0.8015 (0.8028) time: 0.2716 data: 0.0002 max mem: 26157 Train: [57] [1200/6250] eta: 0:25:17 lr: 0.000053 grad: 0.1328 (0.1664) loss: 0.7948 (0.8024) time: 0.2699 data: 0.0002 max mem: 26157 Train: [57] [1300/6250] eta: 0:24:36 lr: 0.000053 grad: 0.1289 (0.1645) loss: 0.7923 (0.8017) time: 0.2774 data: 0.0002 max mem: 26157 Train: [57] [1400/6250] eta: 0:24:09 lr: 0.000053 grad: 0.1363 (0.1636) loss: 0.7961 (0.8011) time: 0.2695 data: 0.0003 max mem: 26157 Train: [57] [1500/6250] eta: 0:23:30 lr: 0.000053 grad: 0.1427 (0.1630) loss: 0.7949 (0.8005) time: 0.2706 data: 0.0007 max mem: 26157 Train: [57] [1600/6250] eta: 0:22:55 lr: 0.000053 grad: 0.1471 (0.1640) loss: 0.7887 (0.7999) time: 0.2696 data: 0.0002 max mem: 26157 Train: [57] [1700/6250] eta: 0:22:20 lr: 0.000053 grad: 0.1301 (0.1624) loss: 0.7909 (0.7996) time: 0.2742 data: 0.0002 max mem: 26157 Train: [57] [1800/6250] eta: 0:21:45 lr: 0.000053 grad: 0.1354 (0.1625) loss: 0.7886 (0.7991) time: 0.2716 data: 0.0002 max mem: 26157 Train: [57] [1900/6250] eta: 0:21:22 lr: 0.000053 grad: 0.1399 (0.1614) loss: 0.7903 (0.7987) time: 0.2691 data: 0.0001 max mem: 26157 Train: [57] [2000/6250] eta: 0:20:47 lr: 0.000053 grad: 0.1322 (0.1611) loss: 0.7875 (0.7984) time: 0.2686 data: 0.0002 max mem: 26157 Train: [57] [2100/6250] eta: 0:20:22 lr: 0.000053 grad: 0.1435 (0.1611) loss: 0.7903 (0.7979) time: 0.2696 data: 0.0002 max mem: 26157 Train: [57] [2200/6250] eta: 0:19:55 lr: 0.000053 grad: 0.1512 (0.1621) loss: 0.7929 (0.7975) time: 0.4588 data: 0.1744 max mem: 26157 Train: [57] [2300/6250] eta: 0:19:21 lr: 0.000052 grad: 0.1542 (0.1624) loss: 0.7859 (0.7972) time: 0.2680 data: 0.0002 max mem: 26157 Train: [57] [2400/6250] eta: 0:18:48 lr: 0.000052 grad: 0.1368 (0.1618) loss: 0.7943 (0.7970) time: 0.2679 data: 0.0002 max mem: 26157 Train: [57] [2500/6250] eta: 0:18:15 lr: 0.000052 grad: 0.1347 (0.1614) loss: 0.7910 (0.7967) time: 0.2694 data: 0.0002 max mem: 26157 Train: [57] [2600/6250] eta: 0:17:54 lr: 0.000052 grad: 0.1435 (0.1612) loss: 0.7932 (0.7964) time: 0.6714 data: 0.4000 max mem: 26157 Train: [57] [2700/6250] eta: 0:17:21 lr: 0.000052 grad: 0.1404 (0.1609) loss: 0.7891 (0.7961) time: 0.2683 data: 0.0002 max mem: 26157 Train: [57] [2800/6250] eta: 0:16:54 lr: 0.000052 grad: 0.1365 (0.1609) loss: 0.7881 (0.7959) time: 0.4649 data: 0.1936 max mem: 26157 Train: [57] [2900/6250] eta: 0:16:27 lr: 0.000052 grad: 0.1450 (0.1616) loss: 0.7927 (0.7956) time: 0.2692 data: 0.0002 max mem: 26157 Train: [57] [3000/6250] eta: 0:15:55 lr: 0.000052 grad: 0.1369 (0.1620) loss: 0.7874 (0.7955) time: 0.2710 data: 0.0002 max mem: 26157 Train: [57] [3100/6250] eta: 0:15:28 lr: 0.000052 grad: 0.1483 (0.1620) loss: 0.7795 (0.7953) time: 0.2728 data: 0.0002 max mem: 26157 Train: [57] [3200/6250] eta: 0:14:57 lr: 0.000052 grad: 0.1371 (0.1620) loss: 0.7969 (0.7952) time: 0.2696 data: 0.0002 max mem: 26157 Train: [57] [3300/6250] eta: 0:14:32 lr: 0.000052 grad: 0.1509 (0.1620) loss: 0.7958 (0.7952) time: 0.2716 data: 0.0001 max mem: 26157 Train: [57] [3400/6250] eta: 0:14:00 lr: 0.000052 grad: 0.1396 (0.1616) loss: 0.7946 (0.7951) time: 0.2685 data: 0.0002 max mem: 26157 Train: [57] [3500/6250] eta: 0:13:29 lr: 0.000052 grad: 0.1364 (0.1613) loss: 0.7932 (0.7951) time: 0.2697 data: 0.0002 max mem: 26157 Train: [57] [3600/6250] eta: 0:12:57 lr: 0.000052 grad: 0.1474 (0.1615) loss: 0.7980 (0.7949) time: 0.2681 data: 0.0001 max mem: 26157 Train: [57] [3700/6250] eta: 0:12:26 lr: 0.000052 grad: 0.1482 (0.1618) loss: 0.7833 (0.7948) time: 0.2687 data: 0.0002 max mem: 26157 Train: [57] [3800/6250] eta: 0:11:56 lr: 0.000052 grad: 0.1403 (0.1617) loss: 0.7905 (0.7947) time: 0.2708 data: 0.0002 max mem: 26157 Train: [57] [3900/6250] eta: 0:11:25 lr: 0.000052 grad: 0.1383 (0.1615) loss: 0.7952 (0.7945) time: 0.2690 data: 0.0003 max mem: 26157 Train: [57] [4000/6250] eta: 0:10:55 lr: 0.000052 grad: 0.1404 (0.1614) loss: 0.7911 (0.7944) time: 0.2681 data: 0.0002 max mem: 26157 Train: [57] [4100/6250] eta: 0:10:24 lr: 0.000052 grad: 0.1431 (0.1616) loss: 0.7931 (0.7943) time: 0.2686 data: 0.0002 max mem: 26157 Train: [57] [4200/6250] eta: 0:09:54 lr: 0.000052 grad: 0.1534 (0.1620) loss: 0.7886 (0.7942) time: 0.2708 data: 0.0002 max mem: 26157 Train: [57] [4300/6250] eta: 0:09:24 lr: 0.000052 grad: 0.1432 (0.1618) loss: 0.7918 (0.7941) time: 0.2688 data: 0.0001 max mem: 26157 Train: [57] [4400/6250] eta: 0:08:54 lr: 0.000052 grad: 0.1419 (0.1618) loss: 0.7859 (0.7940) time: 0.2690 data: 0.0002 max mem: 26157 Train: [57] [4500/6250] eta: 0:08:25 lr: 0.000052 grad: 0.1394 (0.1615) loss: 0.7872 (0.7939) time: 0.2682 data: 0.0002 max mem: 26157 Train: [57] [4600/6250] eta: 0:07:55 lr: 0.000052 grad: 0.1506 (0.1615) loss: 0.7986 (0.7938) time: 0.2723 data: 0.0002 max mem: 26157 Train: [57] [4700/6250] eta: 0:07:26 lr: 0.000052 grad: 0.1387 (0.1616) loss: 0.7928 (0.7938) time: 0.2722 data: 0.0002 max mem: 26157 Train: [57] [4800/6250] eta: 0:06:57 lr: 0.000052 grad: 0.1396 (0.1617) loss: 0.7927 (0.7938) time: 0.2697 data: 0.0002 max mem: 26157 Train: [57] [4900/6250] eta: 0:06:27 lr: 0.000052 grad: 0.1352 (0.1617) loss: 0.7997 (0.7938) time: 0.2685 data: 0.0002 max mem: 26157 Train: [57] [5000/6250] eta: 0:05:58 lr: 0.000052 grad: 0.1250 (0.1615) loss: 0.7887 (0.7938) time: 0.2727 data: 0.0002 max mem: 26157 Train: [57] [5100/6250] eta: 0:05:29 lr: 0.000052 grad: 0.1339 (0.1616) loss: 0.7979 (0.7938) time: 0.2704 data: 0.0002 max mem: 26157 Train: [57] [5200/6250] eta: 0:05:00 lr: 0.000052 grad: 0.1428 (0.1616) loss: 0.7936 (0.7939) time: 0.2694 data: 0.0002 max mem: 26157 Train: [57] [5300/6250] eta: 0:04:33 lr: 0.000052 grad: 0.1430 (0.1618) loss: 0.7956 (0.7939) time: 0.2704 data: 0.0002 max mem: 26157 Train: [57] [5400/6250] eta: 0:04:04 lr: 0.000051 grad: 0.1423 (0.1622) loss: 0.7950 (0.7939) time: 0.4171 data: 0.1431 max mem: 26157 Train: [57] [5500/6250] eta: 0:03:35 lr: 0.000051 grad: 0.1501 (0.1623) loss: 0.7919 (0.7939) time: 0.2696 data: 0.0002 max mem: 26157 Train: [57] [5600/6250] eta: 0:03:07 lr: 0.000051 grad: 0.1464 (0.1625) loss: 0.7981 (0.7938) time: 0.2745 data: 0.0002 max mem: 26157 Train: [57] [5700/6250] eta: 0:02:38 lr: 0.000051 grad: 0.1410 (0.1627) loss: 0.7995 (0.7938) time: 0.2711 data: 0.0002 max mem: 26157 Train: [57] [5800/6250] eta: 0:02:09 lr: 0.000051 grad: 0.1338 (0.1626) loss: 0.7931 (0.7938) time: 0.2699 data: 0.0002 max mem: 26157 Train: [57] [5900/6250] eta: 0:01:40 lr: 0.000051 grad: 0.1456 (0.1625) loss: 0.7915 (0.7938) time: 0.2703 data: 0.0002 max mem: 26157 Train: [57] [6000/6250] eta: 0:01:11 lr: 0.000051 grad: 0.1314 (0.1626) loss: 0.8007 (0.7939) time: 0.2717 data: 0.0002 max mem: 26157 Train: [57] [6100/6250] eta: 0:00:43 lr: 0.000051 grad: 0.1503 (0.1627) loss: 0.7921 (0.7939) time: 0.2695 data: 0.0002 max mem: 26157 Train: [57] [6200/6250] eta: 0:00:14 lr: 0.000051 grad: 0.1563 (0.1630) loss: 0.7937 (0.7939) time: 0.2701 data: 0.0002 max mem: 26157 Train: [57] [6249/6250] eta: 0:00:00 lr: 0.000051 grad: 0.1417 (0.1630) loss: 0.7903 (0.7939) time: 0.2762 data: 0.0002 max mem: 26157 Train: [57] Total time: 0:30:02 (0.2884 s / it) Averaged stats: lr: 0.000051 grad: 0.1417 (0.1630) loss: 0.7903 (0.7939) Eval (hcp-train-subset): [57] [ 0/62] eta: 0:04:59 loss: 0.8244 (0.8244) time: 4.8374 data: 4.7537 max mem: 26157 Eval (hcp-train-subset): [57] [61/62] eta: 0:00:00 loss: 0.8194 (0.8204) time: 0.1321 data: 0.0493 max mem: 26157 Eval (hcp-train-subset): [57] Total time: 0:00:13 (0.2155 s / it) Averaged stats (hcp-train-subset): loss: 0.8194 (0.8204) Making plots (hcp-train-subset): example=7 Eval (hcp-val): [57] [ 0/62] eta: 0:05:26 loss: 0.8221 (0.8221) time: 5.2623 data: 5.1790 max mem: 26157 Eval (hcp-val): [57] [61/62] eta: 0:00:00 loss: 0.8241 (0.8257) time: 0.1347 data: 0.0495 max mem: 26157 Eval (hcp-val): [57] Total time: 0:00:13 (0.2111 s / it) Averaged stats (hcp-val): loss: 0.8241 (0.8257) Making plots (hcp-val): example=51 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [58] [ 0/6250] eta: 9:27:26 lr: 0.000051 grad: 0.1346 (0.1346) loss: 0.8441 (0.8441) time: 5.4475 data: 5.0757 max mem: 26157 Train: [58] [ 100/6250] eta: 0:34:00 lr: 0.000051 grad: 0.1443 (0.1938) loss: 0.8066 (0.8011) time: 0.2699 data: 0.0002 max mem: 26157 Train: [58] [ 200/6250] eta: 0:30:16 lr: 0.000051 grad: 0.1347 (0.1771) loss: 0.8015 (0.7991) time: 0.2687 data: 0.0002 max mem: 26157 Train: [58] [ 300/6250] eta: 0:28:46 lr: 0.000051 grad: 0.1323 (0.1658) loss: 0.7982 (0.7994) time: 0.2686 data: 0.0002 max mem: 26157 Train: [58] [ 400/6250] eta: 0:27:48 lr: 0.000051 grad: 0.1282 (0.1609) loss: 0.8018 (0.8005) time: 0.2710 data: 0.0002 max mem: 26157 Train: [58] [ 500/6250] eta: 0:27:01 lr: 0.000051 grad: 0.1324 (0.1593) loss: 0.8112 (0.8016) time: 0.2702 data: 0.0002 max mem: 26157 Train: [58] [ 600/6250] eta: 0:26:20 lr: 0.000051 grad: 0.1517 (0.1618) loss: 0.8105 (0.8021) time: 0.2681 data: 0.0001 max mem: 26157 Train: [58] [ 700/6250] eta: 0:25:43 lr: 0.000051 grad: 0.1488 (0.1607) loss: 0.8010 (0.8017) time: 0.2696 data: 0.0002 max mem: 26157 Train: [58] [ 800/6250] eta: 0:25:10 lr: 0.000051 grad: 0.1399 (0.1611) loss: 0.7991 (0.8014) time: 0.2709 data: 0.0002 max mem: 26157 Train: [58] [ 900/6250] eta: 0:24:38 lr: 0.000051 grad: 0.1382 (0.1607) loss: 0.8050 (0.8010) time: 0.2710 data: 0.0002 max mem: 26157 Train: [58] [1000/6250] eta: 0:24:07 lr: 0.000051 grad: 0.1467 (0.1593) loss: 0.7962 (0.8008) time: 0.2714 data: 0.0002 max mem: 26157 Train: [58] [1100/6250] eta: 0:23:38 lr: 0.000051 grad: 0.1402 (0.1587) loss: 0.7885 (0.8005) time: 0.2714 data: 0.0002 max mem: 26157 Train: [58] [1200/6250] eta: 0:23:08 lr: 0.000051 grad: 0.1271 (0.1583) loss: 0.7948 (0.7999) time: 0.2705 data: 0.0002 max mem: 26157 Train: [58] [1300/6250] eta: 0:22:39 lr: 0.000051 grad: 0.1404 (0.1571) loss: 0.8003 (0.7995) time: 0.2732 data: 0.0002 max mem: 26157 Train: [58] [1400/6250] eta: 0:22:10 lr: 0.000051 grad: 0.1324 (0.1575) loss: 0.7940 (0.7990) time: 0.2698 data: 0.0002 max mem: 26157 Train: [58] [1500/6250] eta: 0:21:55 lr: 0.000051 grad: 0.1350 (0.1578) loss: 0.7918 (0.7985) time: 0.4789 data: 0.2005 max mem: 26157 Train: [58] [1600/6250] eta: 0:22:05 lr: 0.000051 grad: 0.1569 (0.1580) loss: 0.7990 (0.7981) time: 0.2719 data: 0.0002 max mem: 26157 Train: [58] [1700/6250] eta: 0:21:33 lr: 0.000051 grad: 0.1306 (0.1574) loss: 0.7975 (0.7978) time: 0.2716 data: 0.0002 max mem: 26157 Train: [58] [1800/6250] eta: 0:21:17 lr: 0.000051 grad: 0.1504 (0.1577) loss: 0.7916 (0.7974) time: 0.5928 data: 0.3163 max mem: 26157 Train: [58] [1900/6250] eta: 0:20:47 lr: 0.000051 grad: 0.1367 (0.1576) loss: 0.7942 (0.7969) time: 0.2701 data: 0.0002 max mem: 26157 Train: [58] [2000/6250] eta: 0:20:15 lr: 0.000051 grad: 0.1393 (0.1574) loss: 0.7938 (0.7966) time: 0.2695 data: 0.0002 max mem: 26157 Train: [58] [2100/6250] eta: 0:19:43 lr: 0.000051 grad: 0.1453 (0.1572) loss: 0.7891 (0.7965) time: 0.2702 data: 0.0001 max mem: 26157 Train: [58] [2200/6250] eta: 0:19:15 lr: 0.000050 grad: 0.1440 (0.1579) loss: 0.7949 (0.7963) time: 0.2768 data: 0.0002 max mem: 26157 Train: [58] [2300/6250] eta: 0:18:45 lr: 0.000050 grad: 0.1420 (0.1577) loss: 0.7978 (0.7961) time: 0.2715 data: 0.0002 max mem: 26157 Train: [58] [2400/6250] eta: 0:18:14 lr: 0.000050 grad: 0.1474 (0.1574) loss: 0.7907 (0.7959) time: 0.2698 data: 0.0002 max mem: 26157 Train: [58] [2500/6250] eta: 0:17:44 lr: 0.000050 grad: 0.1376 (0.1579) loss: 0.7883 (0.7957) time: 0.2685 data: 0.0002 max mem: 26157 Train: [58] [2600/6250] eta: 0:17:13 lr: 0.000050 grad: 0.1339 (0.1576) loss: 0.7932 (0.7954) time: 0.2683 data: 0.0002 max mem: 26157 Train: [58] [2700/6250] eta: 0:16:43 lr: 0.000050 grad: 0.1399 (0.1580) loss: 0.7879 (0.7953) time: 0.2687 data: 0.0003 max mem: 26157 Train: [58] [2800/6250] eta: 0:16:23 lr: 0.000050 grad: 0.1392 (0.1577) loss: 0.7915 (0.7952) time: 0.2699 data: 0.0002 max mem: 26157 Train: [58] [2900/6250] eta: 0:15:52 lr: 0.000050 grad: 0.1420 (0.1582) loss: 0.7858 (0.7950) time: 0.2707 data: 0.0002 max mem: 26157 Train: [58] [3000/6250] eta: 0:15:23 lr: 0.000050 grad: 0.1477 (0.1591) loss: 0.7940 (0.7948) time: 0.2698 data: 0.0002 max mem: 26157 Train: [58] [3100/6250] eta: 0:15:02 lr: 0.000050 grad: 0.1582 (0.1602) loss: 0.7885 (0.7945) time: 0.2705 data: 0.0002 max mem: 26157 Train: [58] [3200/6250] eta: 0:14:33 lr: 0.000050 grad: 0.1589 (0.1608) loss: 0.7902 (0.7943) time: 0.2699 data: 0.0002 max mem: 26157 Train: [58] [3300/6250] eta: 0:14:02 lr: 0.000050 grad: 0.1426 (0.1610) loss: 0.7852 (0.7941) time: 0.2686 data: 0.0002 max mem: 26157 Train: [58] [3400/6250] eta: 0:13:39 lr: 0.000050 grad: 0.1561 (0.1612) loss: 0.7866 (0.7939) time: 0.2693 data: 0.0002 max mem: 26157 Train: [58] [3500/6250] eta: 0:13:09 lr: 0.000050 grad: 0.1617 (0.1614) loss: 0.7822 (0.7937) time: 0.2702 data: 0.0002 max mem: 26157 Train: [58] [3600/6250] eta: 0:12:40 lr: 0.000050 grad: 0.1515 (0.1622) loss: 0.7943 (0.7935) time: 0.3553 data: 0.0828 max mem: 26157 Train: [58] [3700/6250] eta: 0:12:12 lr: 0.000050 grad: 0.1650 (0.1634) loss: 0.7870 (0.7932) time: 0.2712 data: 0.0002 max mem: 26157 Train: [58] [3800/6250] eta: 0:11:44 lr: 0.000050 grad: 0.1549 (0.1641) loss: 0.7862 (0.7931) time: 0.4108 data: 0.1334 max mem: 26157 Train: [58] [3900/6250] eta: 0:11:14 lr: 0.000050 grad: 0.1505 (0.1643) loss: 0.7854 (0.7929) time: 0.2703 data: 0.0002 max mem: 26157 Train: [58] [4000/6250] eta: 0:10:47 lr: 0.000050 grad: 0.1536 (0.1643) loss: 0.7812 (0.7928) time: 0.2729 data: 0.0002 max mem: 26157 Train: [58] [4100/6250] eta: 0:10:21 lr: 0.000050 grad: 0.1399 (0.1644) loss: 0.7894 (0.7926) time: 0.2701 data: 0.0002 max mem: 26157 Train: [58] [4200/6250] eta: 0:09:51 lr: 0.000050 grad: 0.1400 (0.1643) loss: 0.7851 (0.7925) time: 0.2709 data: 0.0003 max mem: 26157 Train: [58] [4300/6250] eta: 0:09:21 lr: 0.000050 grad: 0.1392 (0.1641) loss: 0.7877 (0.7924) time: 0.2706 data: 0.0002 max mem: 26157 Train: [58] [4400/6250] eta: 0:08:53 lr: 0.000050 grad: 0.1372 (0.1641) loss: 0.7906 (0.7922) time: 0.2698 data: 0.0002 max mem: 26157 Train: [58] [4500/6250] eta: 0:08:24 lr: 0.000050 grad: 0.1527 (0.1641) loss: 0.7840 (0.7921) time: 0.2724 data: 0.0002 max mem: 26157 Train: [58] [4600/6250] eta: 0:07:54 lr: 0.000050 grad: 0.1423 (0.1638) loss: 0.7942 (0.7921) time: 0.2708 data: 0.0002 max mem: 26157 Train: [58] [4700/6250] eta: 0:07:26 lr: 0.000050 grad: 0.1416 (0.1641) loss: 0.7861 (0.7921) time: 0.2710 data: 0.0002 max mem: 26157 Train: [58] [4800/6250] eta: 0:07:00 lr: 0.000050 grad: 0.1330 (0.1647) loss: 0.7961 (0.7921) time: 0.2729 data: 0.0002 max mem: 26157 Train: [58] [4900/6250] eta: 0:06:30 lr: 0.000050 grad: 0.1526 (0.1646) loss: 0.7855 (0.7920) time: 0.3112 data: 0.0361 max mem: 26157 Train: [58] [5000/6250] eta: 0:06:03 lr: 0.000050 grad: 0.1402 (0.1646) loss: 0.7865 (0.7920) time: 0.2714 data: 0.0002 max mem: 26157 Train: [58] [5100/6250] eta: 0:05:33 lr: 0.000050 grad: 0.1336 (0.1643) loss: 0.7946 (0.7919) time: 0.2736 data: 0.0002 max mem: 26157 Train: [58] [5200/6250] eta: 0:05:04 lr: 0.000050 grad: 0.1506 (0.1642) loss: 0.7906 (0.7919) time: 0.2952 data: 0.0243 max mem: 26157 Train: [58] [5300/6250] eta: 0:04:35 lr: 0.000049 grad: 0.1436 (0.1642) loss: 0.7888 (0.7918) time: 0.2686 data: 0.0002 max mem: 26157 Train: [58] [5400/6250] eta: 0:04:07 lr: 0.000049 grad: 0.1479 (0.1645) loss: 0.7819 (0.7917) time: 0.2682 data: 0.0001 max mem: 26157 Train: [58] [5500/6250] eta: 0:03:37 lr: 0.000049 grad: 0.1549 (0.1644) loss: 0.7786 (0.7916) time: 0.2714 data: 0.0002 max mem: 26157 Train: [58] [5600/6250] eta: 0:03:08 lr: 0.000049 grad: 0.1640 (0.1647) loss: 0.7804 (0.7915) time: 0.2697 data: 0.0002 max mem: 26157 Train: [58] [5700/6250] eta: 0:02:39 lr: 0.000049 grad: 0.1530 (0.1647) loss: 0.7828 (0.7913) time: 0.2710 data: 0.0001 max mem: 26157 Train: [58] [5800/6250] eta: 0:02:10 lr: 0.000049 grad: 0.1471 (0.1651) loss: 0.7954 (0.7913) time: 0.2713 data: 0.0002 max mem: 26157 Train: [58] [5900/6250] eta: 0:01:41 lr: 0.000049 grad: 0.1414 (0.1650) loss: 0.7964 (0.7912) time: 0.2697 data: 0.0002 max mem: 26157 Train: [58] [6000/6250] eta: 0:01:12 lr: 0.000049 grad: 0.1489 (0.1652) loss: 0.7857 (0.7912) time: 0.2705 data: 0.0002 max mem: 26157 Train: [58] [6100/6250] eta: 0:00:43 lr: 0.000049 grad: 0.1541 (0.1656) loss: 0.7838 (0.7911) time: 0.2676 data: 0.0001 max mem: 26157 Train: [58] [6200/6250] eta: 0:00:14 lr: 0.000049 grad: 0.1394 (0.1657) loss: 0.7942 (0.7910) time: 0.2698 data: 0.0002 max mem: 26157 Train: [58] [6249/6250] eta: 0:00:00 lr: 0.000049 grad: 0.1490 (0.1658) loss: 0.7916 (0.7910) time: 0.2695 data: 0.0002 max mem: 26157 Train: [58] Total time: 0:30:08 (0.2894 s / it) Averaged stats: lr: 0.000049 grad: 0.1490 (0.1658) loss: 0.7916 (0.7910) Eval (hcp-train-subset): [58] [ 0/62] eta: 0:03:21 loss: 0.8289 (0.8289) time: 3.2461 data: 3.1346 max mem: 26157 Eval (hcp-train-subset): [58] [61/62] eta: 0:00:00 loss: 0.8173 (0.8186) time: 0.1220 data: 0.0393 max mem: 26157 Eval (hcp-train-subset): [58] Total time: 0:00:12 (0.2078 s / it) Averaged stats (hcp-train-subset): loss: 0.8173 (0.8186) Making plots (hcp-train-subset): example=27 Eval (hcp-val): [58] [ 0/62] eta: 0:03:53 loss: 0.8219 (0.8219) time: 3.7597 data: 3.6590 max mem: 26157 Eval (hcp-val): [58] [61/62] eta: 0:00:00 loss: 0.8233 (0.8247) time: 0.1250 data: 0.0404 max mem: 26157 Eval (hcp-val): [58] Total time: 0:00:12 (0.2096 s / it) Averaged stats (hcp-val): loss: 0.8233 (0.8247) Making plots (hcp-val): example=61 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [59] [ 0/6250] eta: 11:29:21 lr: 0.000049 grad: 0.1681 (0.1681) loss: 0.8675 (0.8675) time: 6.6178 data: 6.3411 max mem: 26157 Train: [59] [ 100/6250] eta: 0:34:08 lr: 0.000049 grad: 0.1502 (0.2012) loss: 0.8137 (0.8111) time: 0.2700 data: 0.0002 max mem: 26157 Train: [59] [ 200/6250] eta: 0:30:26 lr: 0.000049 grad: 0.1354 (0.1764) loss: 0.8088 (0.8079) time: 0.2686 data: 0.0002 max mem: 26157 Train: [59] [ 300/6250] eta: 0:28:58 lr: 0.000049 grad: 0.1486 (0.1728) loss: 0.8012 (0.8059) time: 0.2744 data: 0.0002 max mem: 26157 Train: [59] [ 400/6250] eta: 0:27:57 lr: 0.000049 grad: 0.1424 (0.1709) loss: 0.7905 (0.8037) time: 0.2719 data: 0.0002 max mem: 26157 Train: [59] [ 500/6250] eta: 0:27:10 lr: 0.000049 grad: 0.1535 (0.1706) loss: 0.7941 (0.8018) time: 0.2694 data: 0.0002 max mem: 26157 Train: [59] [ 600/6250] eta: 0:26:28 lr: 0.000049 grad: 0.1360 (0.1692) loss: 0.8000 (0.8011) time: 0.2684 data: 0.0002 max mem: 26157 Train: [59] [ 700/6250] eta: 0:26:56 lr: 0.000049 grad: 0.1386 (0.1664) loss: 0.8002 (0.8007) time: 0.2708 data: 0.0002 max mem: 26157 Train: [59] [ 800/6250] eta: 0:26:22 lr: 0.000049 grad: 0.1345 (0.1643) loss: 0.7969 (0.8004) time: 0.2743 data: 0.0002 max mem: 26157 Train: [59] [ 900/6250] eta: 0:26:14 lr: 0.000049 grad: 0.1349 (0.1627) loss: 0.7946 (0.8000) time: 0.3778 data: 0.1061 max mem: 26157 Train: [59] [1000/6250] eta: 0:25:32 lr: 0.000049 grad: 0.1298 (0.1614) loss: 0.7903 (0.7996) time: 0.2689 data: 0.0002 max mem: 26157 Train: [59] [1100/6250] eta: 0:24:53 lr: 0.000049 grad: 0.1400 (0.1612) loss: 0.7928 (0.7991) time: 0.2733 data: 0.0003 max mem: 26157 Train: [59] [1200/6250] eta: 0:24:24 lr: 0.000049 grad: 0.1331 (0.1609) loss: 0.7935 (0.7987) time: 0.2719 data: 0.0002 max mem: 26157 Train: [59] [1300/6250] eta: 0:23:47 lr: 0.000049 grad: 0.1497 (0.1624) loss: 0.7926 (0.7982) time: 0.2702 data: 0.0002 max mem: 26157 Train: [59] [1400/6250] eta: 0:23:12 lr: 0.000049 grad: 0.1467 (0.1632) loss: 0.7915 (0.7978) time: 0.2702 data: 0.0002 max mem: 26157 Train: [59] [1500/6250] eta: 0:22:38 lr: 0.000049 grad: 0.1421 (0.1633) loss: 0.7859 (0.7972) time: 0.2692 data: 0.0002 max mem: 26157 Train: [59] [1600/6250] eta: 0:22:05 lr: 0.000049 grad: 0.1339 (0.1627) loss: 0.7938 (0.7968) time: 0.2739 data: 0.0002 max mem: 26157 Train: [59] [1700/6250] eta: 0:21:58 lr: 0.000049 grad: 0.1344 (0.1630) loss: 0.7897 (0.7963) time: 0.2684 data: 0.0002 max mem: 26157 Train: [59] [1800/6250] eta: 0:21:24 lr: 0.000049 grad: 0.1382 (0.1630) loss: 0.7852 (0.7956) time: 0.2681 data: 0.0002 max mem: 26157 Train: [59] [1900/6250] eta: 0:20:50 lr: 0.000049 grad: 0.1412 (0.1630) loss: 0.7867 (0.7951) time: 0.2682 data: 0.0002 max mem: 26157 Train: [59] [2000/6250] eta: 0:20:17 lr: 0.000049 grad: 0.1469 (0.1633) loss: 0.7817 (0.7947) time: 0.2689 data: 0.0002 max mem: 26157 Train: [59] [2100/6250] eta: 0:19:46 lr: 0.000048 grad: 0.1326 (0.1633) loss: 0.7937 (0.7943) time: 0.2688 data: 0.0002 max mem: 26157 Train: [59] [2200/6250] eta: 0:19:14 lr: 0.000048 grad: 0.1519 (0.1636) loss: 0.7889 (0.7941) time: 0.2702 data: 0.0002 max mem: 26157 Train: [59] [2300/6250] eta: 0:18:43 lr: 0.000048 grad: 0.1463 (0.1650) loss: 0.7854 (0.7939) time: 0.2693 data: 0.0002 max mem: 26157 Train: [59] [2400/6250] eta: 0:18:12 lr: 0.000048 grad: 0.1594 (0.1647) loss: 0.7930 (0.7936) time: 0.2687 data: 0.0002 max mem: 26157 Train: [59] [2500/6250] eta: 0:17:42 lr: 0.000048 grad: 0.1398 (0.1647) loss: 0.7819 (0.7932) time: 0.2696 data: 0.0002 max mem: 26157 Train: [59] [2600/6250] eta: 0:17:12 lr: 0.000048 grad: 0.1441 (0.1645) loss: 0.7807 (0.7930) time: 0.2696 data: 0.0002 max mem: 26157 Train: [59] [2700/6250] eta: 0:16:42 lr: 0.000048 grad: 0.1436 (0.1648) loss: 0.7845 (0.7927) time: 0.2719 data: 0.0003 max mem: 26157 Train: [59] [2800/6250] eta: 0:16:12 lr: 0.000048 grad: 0.1468 (0.1646) loss: 0.7928 (0.7924) time: 0.2720 data: 0.0002 max mem: 26157 Train: [59] [2900/6250] eta: 0:15:46 lr: 0.000048 grad: 0.1404 (0.1647) loss: 0.7903 (0.7923) time: 0.4123 data: 0.1355 max mem: 26157 Train: [59] [3000/6250] eta: 0:15:27 lr: 0.000048 grad: 0.1562 (0.1649) loss: 0.7872 (0.7921) time: 0.2715 data: 0.0002 max mem: 26157 Train: [59] [3100/6250] eta: 0:14:56 lr: 0.000048 grad: 0.1477 (0.1651) loss: 0.7872 (0.7920) time: 0.2692 data: 0.0002 max mem: 26157 Train: [59] [3200/6250] eta: 0:14:27 lr: 0.000048 grad: 0.1477 (0.1650) loss: 0.7958 (0.7919) time: 0.2688 data: 0.0001 max mem: 26157 Train: [59] [3300/6250] eta: 0:13:57 lr: 0.000048 grad: 0.1454 (0.1656) loss: 0.7860 (0.7918) time: 0.2686 data: 0.0002 max mem: 26157 Train: [59] [3400/6250] eta: 0:13:27 lr: 0.000048 grad: 0.1475 (0.1659) loss: 0.7948 (0.7917) time: 0.2687 data: 0.0002 max mem: 26157 Train: [59] [3500/6250] eta: 0:12:58 lr: 0.000048 grad: 0.1500 (0.1664) loss: 0.7838 (0.7915) time: 0.2696 data: 0.0002 max mem: 26157 Train: [59] [3600/6250] eta: 0:12:29 lr: 0.000048 grad: 0.1499 (0.1664) loss: 0.7834 (0.7915) time: 0.2737 data: 0.0002 max mem: 26157 Train: [59] [3700/6250] eta: 0:11:59 lr: 0.000048 grad: 0.1560 (0.1670) loss: 0.7865 (0.7914) time: 0.2705 data: 0.0002 max mem: 26157 Train: [59] [3800/6250] eta: 0:11:30 lr: 0.000048 grad: 0.1488 (0.1678) loss: 0.7919 (0.7913) time: 0.2698 data: 0.0002 max mem: 26157 Train: [59] [3900/6250] eta: 0:11:10 lr: 0.000048 grad: 0.1494 (0.1680) loss: 0.7861 (0.7913) time: 0.2915 data: 0.0003 max mem: 26157 Train: [59] [4000/6250] eta: 0:10:41 lr: 0.000048 grad: 0.1533 (0.1684) loss: 0.7936 (0.7911) time: 0.2726 data: 0.0002 max mem: 26157 Train: [59] [4100/6250] eta: 0:10:12 lr: 0.000048 grad: 0.1584 (0.1690) loss: 0.7837 (0.7910) time: 0.2701 data: 0.0002 max mem: 26157 Train: [59] [4200/6250] eta: 0:09:43 lr: 0.000048 grad: 0.1492 (0.1689) loss: 0.7780 (0.7908) time: 0.2694 data: 0.0002 max mem: 26157 Train: [59] [4300/6250] eta: 0:09:14 lr: 0.000048 grad: 0.1476 (0.1692) loss: 0.7847 (0.7906) time: 0.2712 data: 0.0002 max mem: 26157 Train: [59] [4400/6250] eta: 0:08:45 lr: 0.000048 grad: 0.1510 (0.1700) loss: 0.7832 (0.7905) time: 0.2765 data: 0.0002 max mem: 26157 Train: [59] [4500/6250] eta: 0:08:17 lr: 0.000048 grad: 0.1398 (0.1703) loss: 0.7891 (0.7904) time: 0.3919 data: 0.1226 max mem: 26157 Train: [59] [4600/6250] eta: 0:07:53 lr: 0.000048 grad: 0.1384 (0.1705) loss: 0.7875 (0.7903) time: 0.5807 data: 0.3038 max mem: 26157 Train: [59] [4700/6250] eta: 0:07:23 lr: 0.000048 grad: 0.1377 (0.1708) loss: 0.7880 (0.7902) time: 0.2707 data: 0.0002 max mem: 26157 Train: [59] [4800/6250] eta: 0:06:56 lr: 0.000048 grad: 0.1536 (0.1715) loss: 0.7879 (0.7902) time: 0.4791 data: 0.2085 max mem: 26157 Train: [59] [4900/6250] eta: 0:06:27 lr: 0.000048 grad: 0.1405 (0.1713) loss: 0.7919 (0.7902) time: 0.2746 data: 0.0002 max mem: 26157 Train: [59] [5000/6250] eta: 0:05:58 lr: 0.000048 grad: 0.1495 (0.1716) loss: 0.7889 (0.7901) time: 0.2711 data: 0.0002 max mem: 26157 Train: [59] [5100/6250] eta: 0:05:31 lr: 0.000048 grad: 0.1496 (0.1724) loss: 0.7878 (0.7901) time: 0.2694 data: 0.0002 max mem: 26157 Train: [59] [5200/6250] eta: 0:05:02 lr: 0.000047 grad: 0.1473 (0.1724) loss: 0.7910 (0.7901) time: 0.2694 data: 0.0002 max mem: 26157 Train: [59] [5300/6250] eta: 0:04:33 lr: 0.000047 grad: 0.1407 (0.1722) loss: 0.7948 (0.7901) time: 0.2684 data: 0.0002 max mem: 26157 Train: [59] [5400/6250] eta: 0:04:04 lr: 0.000047 grad: 0.1472 (0.1726) loss: 0.7842 (0.7901) time: 0.2689 data: 0.0002 max mem: 26157 Train: [59] [5500/6250] eta: 0:03:35 lr: 0.000047 grad: 0.1486 (0.1729) loss: 0.7807 (0.7901) time: 0.2697 data: 0.0001 max mem: 26157 Train: [59] [5600/6250] eta: 0:03:08 lr: 0.000047 grad: 0.1412 (0.1734) loss: 0.7851 (0.7901) time: 1.2379 data: 0.9620 max mem: 26157 Train: [59] [5700/6250] eta: 0:02:39 lr: 0.000047 grad: 0.1509 (0.1732) loss: 0.7910 (0.7901) time: 0.2701 data: 0.0002 max mem: 26157 Train: [59] [5800/6250] eta: 0:02:10 lr: 0.000047 grad: 0.1430 (0.1733) loss: 0.7907 (0.7901) time: 0.2689 data: 0.0002 max mem: 26157 Train: [59] [5900/6250] eta: 0:01:41 lr: 0.000047 grad: 0.1404 (0.1733) loss: 0.7894 (0.7901) time: 0.2689 data: 0.0002 max mem: 26157 Train: [59] [6000/6250] eta: 0:01:12 lr: 0.000047 grad: 0.1518 (0.1732) loss: 0.7906 (0.7901) time: 0.2716 data: 0.0002 max mem: 26157 Train: [59] [6100/6250] eta: 0:00:43 lr: 0.000047 grad: 0.1438 (0.1733) loss: 0.7970 (0.7901) time: 0.2682 data: 0.0002 max mem: 26157 Train: [59] [6200/6250] eta: 0:00:14 lr: 0.000047 grad: 0.1570 (0.1735) loss: 0.7866 (0.7901) time: 0.2761 data: 0.0002 max mem: 26157 Train: [59] [6249/6250] eta: 0:00:00 lr: 0.000047 grad: 0.1541 (0.1736) loss: 0.7885 (0.7901) time: 0.2716 data: 0.0002 max mem: 26157 Train: [59] Total time: 0:30:07 (0.2893 s / it) Averaged stats: lr: 0.000047 grad: 0.1541 (0.1736) loss: 0.7885 (0.7901) Eval (hcp-train-subset): [59] [ 0/62] eta: 0:04:14 loss: 0.8277 (0.8277) time: 4.1119 data: 4.0094 max mem: 26157 Eval (hcp-train-subset): [59] [61/62] eta: 0:00:00 loss: 0.8145 (0.8174) time: 0.1264 data: 0.0416 max mem: 26157 Eval (hcp-train-subset): [59] Total time: 0:00:13 (0.2195 s / it) Averaged stats (hcp-train-subset): loss: 0.8145 (0.8174) Making plots (hcp-train-subset): example=60 Eval (hcp-val): [59] [ 0/62] eta: 0:05:52 loss: 0.8198 (0.8198) time: 5.6776 data: 5.5941 max mem: 26157 Eval (hcp-val): [59] [61/62] eta: 0:00:00 loss: 0.8244 (0.8252) time: 0.1366 data: 0.0538 max mem: 26157 Eval (hcp-val): [59] Total time: 0:00:13 (0.2187 s / it) Averaged stats (hcp-val): loss: 0.8244 (0.8252) Making plots (hcp-val): example=14 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [60] [ 0/6250] eta: 10:44:20 lr: 0.000047 grad: 0.1166 (0.1166) loss: 0.8679 (0.8679) time: 6.1857 data: 5.9121 max mem: 26157 Train: [60] [ 100/6250] eta: 0:33:50 lr: 0.000047 grad: 0.1402 (0.1793) loss: 0.8028 (0.8089) time: 0.2725 data: 0.0002 max mem: 26157 Train: [60] [ 200/6250] eta: 0:30:22 lr: 0.000047 grad: 0.1476 (0.1717) loss: 0.7960 (0.8067) time: 0.2705 data: 0.0002 max mem: 26157 Train: [60] [ 300/6250] eta: 0:28:52 lr: 0.000047 grad: 0.1530 (0.1712) loss: 0.7945 (0.8039) time: 0.2707 data: 0.0002 max mem: 26157 Train: [60] [ 400/6250] eta: 0:27:52 lr: 0.000047 grad: 0.1593 (0.1737) loss: 0.7982 (0.8018) time: 0.2709 data: 0.0002 max mem: 26157 Train: [60] [ 500/6250] eta: 0:27:08 lr: 0.000047 grad: 0.1524 (0.1761) loss: 0.7979 (0.8002) time: 0.2757 data: 0.0004 max mem: 26157 Train: [60] [ 600/6250] eta: 0:27:03 lr: 0.000047 grad: 0.1488 (0.1763) loss: 0.7880 (0.7991) time: 0.2686 data: 0.0002 max mem: 26157 Train: [60] [ 700/6250] eta: 0:26:21 lr: 0.000047 grad: 0.1492 (0.1770) loss: 0.7936 (0.7986) time: 0.2711 data: 0.0005 max mem: 26157 Train: [60] [ 800/6250] eta: 0:25:42 lr: 0.000047 grad: 0.1457 (0.1766) loss: 0.7863 (0.7981) time: 0.2685 data: 0.0002 max mem: 26157 Train: [60] [ 900/6250] eta: 0:25:05 lr: 0.000047 grad: 0.1476 (0.1785) loss: 0.8021 (0.7976) time: 0.2699 data: 0.0003 max mem: 26157 Train: [60] [1000/6250] eta: 0:24:31 lr: 0.000047 grad: 0.1366 (0.1765) loss: 0.7929 (0.7973) time: 0.2704 data: 0.0002 max mem: 26157 Train: [60] [1100/6250] eta: 0:23:58 lr: 0.000047 grad: 0.1657 (0.1754) loss: 0.7896 (0.7967) time: 0.2699 data: 0.0002 max mem: 26157 Train: [60] [1200/6250] eta: 0:23:26 lr: 0.000047 grad: 0.1407 (0.1750) loss: 0.7832 (0.7962) time: 0.2700 data: 0.0002 max mem: 26157 Train: [60] [1300/6250] eta: 0:22:55 lr: 0.000047 grad: 0.1530 (0.1745) loss: 0.7873 (0.7959) time: 0.2703 data: 0.0002 max mem: 26157 Train: [60] [1400/6250] eta: 0:22:25 lr: 0.000047 grad: 0.1475 (0.1741) loss: 0.7844 (0.7956) time: 0.2689 data: 0.0002 max mem: 26157 Train: [60] [1500/6250] eta: 0:21:55 lr: 0.000047 grad: 0.1431 (0.1740) loss: 0.7982 (0.7954) time: 0.2717 data: 0.0002 max mem: 26157 Train: [60] [1600/6250] eta: 0:21:25 lr: 0.000047 grad: 0.1380 (0.1739) loss: 0.7899 (0.7950) time: 0.2698 data: 0.0002 max mem: 26157 Train: [60] [1700/6250] eta: 0:20:56 lr: 0.000047 grad: 0.1377 (0.1730) loss: 0.7963 (0.7949) time: 0.2706 data: 0.0002 max mem: 26157 Train: [60] [1800/6250] eta: 0:20:27 lr: 0.000047 grad: 0.1393 (0.1724) loss: 0.7818 (0.7946) time: 0.2759 data: 0.0003 max mem: 26157 Train: [60] [1900/6250] eta: 0:19:59 lr: 0.000047 grad: 0.1394 (0.1725) loss: 0.7950 (0.7945) time: 0.2713 data: 0.0002 max mem: 26157 Train: [60] [2000/6250] eta: 0:19:31 lr: 0.000047 grad: 0.1474 (0.1735) loss: 0.7923 (0.7943) time: 0.2783 data: 0.0002 max mem: 26157 Train: [60] [2100/6250] eta: 0:19:02 lr: 0.000046 grad: 0.1578 (0.1730) loss: 0.7938 (0.7942) time: 0.2720 data: 0.0002 max mem: 26157 Train: [60] [2200/6250] eta: 0:18:34 lr: 0.000046 grad: 0.1483 (0.1739) loss: 0.7924 (0.7942) time: 0.2803 data: 0.0002 max mem: 26157 Train: [60] [2300/6250] eta: 0:18:12 lr: 0.000046 grad: 0.1566 (0.1745) loss: 0.7912 (0.7942) time: 0.2704 data: 0.0002 max mem: 26157 Train: [60] [2400/6250] eta: 0:17:58 lr: 0.000046 grad: 0.1673 (0.1756) loss: 0.7902 (0.7940) time: 0.5318 data: 0.2515 max mem: 26157 Train: [60] [2500/6250] eta: 0:17:29 lr: 0.000046 grad: 0.1356 (0.1753) loss: 0.7890 (0.7939) time: 0.2724 data: 0.0002 max mem: 26157 Train: [60] [2600/6250] eta: 0:17:00 lr: 0.000046 grad: 0.1389 (0.1750) loss: 0.7943 (0.7938) time: 0.2703 data: 0.0002 max mem: 26157 Train: [60] [2700/6250] eta: 0:16:30 lr: 0.000046 grad: 0.1452 (0.1754) loss: 0.7983 (0.7938) time: 0.2686 data: 0.0002 max mem: 26157 Train: [60] [2800/6250] eta: 0:16:01 lr: 0.000046 grad: 0.1464 (0.1757) loss: 0.7969 (0.7939) time: 0.2692 data: 0.0001 max mem: 26157 Train: [60] [2900/6250] eta: 0:15:36 lr: 0.000046 grad: 0.1634 (0.1768) loss: 0.7918 (0.7940) time: 0.4052 data: 0.1321 max mem: 26157 Train: [60] [3000/6250] eta: 0:15:07 lr: 0.000046 grad: 0.1511 (0.1774) loss: 0.7964 (0.7940) time: 0.2683 data: 0.0002 max mem: 26157 Train: [60] [3100/6250] eta: 0:14:38 lr: 0.000046 grad: 0.1469 (0.1778) loss: 0.7902 (0.7940) time: 0.2684 data: 0.0002 max mem: 26157 Train: [60] [3200/6250] eta: 0:14:09 lr: 0.000046 grad: 0.1356 (0.1781) loss: 0.7921 (0.7939) time: 0.2688 data: 0.0002 max mem: 26157 Train: [60] [3300/6250] eta: 0:13:40 lr: 0.000046 grad: 0.1521 (0.1777) loss: 0.7918 (0.7940) time: 0.2688 data: 0.0002 max mem: 26157 Train: [60] [3400/6250] eta: 0:13:12 lr: 0.000046 grad: 0.1492 (0.1774) loss: 0.7894 (0.7939) time: 0.2703 data: 0.0002 max mem: 26157 Train: [60] [3500/6250] eta: 0:12:43 lr: 0.000046 grad: 0.1427 (0.1768) loss: 0.7963 (0.7940) time: 0.2697 data: 0.0002 max mem: 26157 Train: [60] [3600/6250] eta: 0:12:15 lr: 0.000046 grad: 0.1442 (0.1765) loss: 0.7939 (0.7940) time: 0.2714 data: 0.0002 max mem: 26157 Train: [60] [3700/6250] eta: 0:11:47 lr: 0.000046 grad: 0.1500 (0.1764) loss: 0.7907 (0.7939) time: 0.2711 data: 0.0002 max mem: 26157 Train: [60] [3800/6250] eta: 0:11:19 lr: 0.000046 grad: 0.1397 (0.1758) loss: 0.7897 (0.7939) time: 0.2708 data: 0.0002 max mem: 26157 Train: [60] [3900/6250] eta: 0:10:50 lr: 0.000046 grad: 0.1378 (0.1758) loss: 0.7997 (0.7940) time: 0.2696 data: 0.0002 max mem: 26157 Train: [60] [4000/6250] eta: 0:10:22 lr: 0.000046 grad: 0.1472 (0.1758) loss: 0.7944 (0.7940) time: 0.2709 data: 0.0002 max mem: 26157 Train: [60] [4100/6250] eta: 0:09:54 lr: 0.000046 grad: 0.1577 (0.1762) loss: 0.7937 (0.7939) time: 0.2684 data: 0.0002 max mem: 26157 Train: [60] [4200/6250] eta: 0:09:29 lr: 0.000046 grad: 0.1460 (0.1770) loss: 0.7900 (0.7940) time: 0.2692 data: 0.0002 max mem: 26157 Train: [60] [4300/6250] eta: 0:09:01 lr: 0.000046 grad: 0.1411 (0.1769) loss: 0.7946 (0.7939) time: 0.2697 data: 0.0002 max mem: 26157 Train: [60] [4400/6250] eta: 0:08:33 lr: 0.000046 grad: 0.1425 (0.1769) loss: 0.7949 (0.7939) time: 0.2675 data: 0.0002 max mem: 26157 Train: [60] [4500/6250] eta: 0:08:05 lr: 0.000046 grad: 0.1516 (0.1769) loss: 0.7879 (0.7938) time: 0.3212 data: 0.0501 max mem: 26157 Train: [60] [4600/6250] eta: 0:07:37 lr: 0.000046 grad: 0.1441 (0.1769) loss: 0.7883 (0.7938) time: 0.2688 data: 0.0002 max mem: 26157 Train: [60] [4700/6250] eta: 0:07:09 lr: 0.000046 grad: 0.1508 (0.1773) loss: 0.7832 (0.7937) time: 0.2733 data: 0.0002 max mem: 26157 Train: [60] [4800/6250] eta: 0:06:41 lr: 0.000046 grad: 0.1564 (0.1774) loss: 0.7892 (0.7936) time: 0.2671 data: 0.0001 max mem: 26157 Train: [60] [4900/6250] eta: 0:06:14 lr: 0.000046 grad: 0.1781 (0.1798) loss: 0.7896 (0.7935) time: 0.2683 data: 0.0002 max mem: 26157 Train: [60] [5000/6250] eta: 0:05:46 lr: 0.000046 grad: 0.1480 (0.1795) loss: 0.7902 (0.7934) time: 0.2689 data: 0.0002 max mem: 26157 Train: [60] [5100/6250] eta: 0:05:18 lr: 0.000046 grad: 0.1507 (0.1797) loss: 0.7917 (0.7933) time: 0.2683 data: 0.0002 max mem: 26157 Train: [60] [5200/6250] eta: 0:04:50 lr: 0.000045 grad: 0.1474 (0.1797) loss: 0.7926 (0.7932) time: 0.2692 data: 0.0002 max mem: 26157 Train: [60] [5300/6250] eta: 0:04:22 lr: 0.000045 grad: 0.1557 (0.1800) loss: 0.7913 (0.7931) time: 0.2687 data: 0.0002 max mem: 26157 Train: [60] [5400/6250] eta: 0:03:54 lr: 0.000045 grad: 0.1566 (0.1802) loss: 0.7932 (0.7932) time: 0.2699 data: 0.0002 max mem: 26157 Train: [60] [5500/6250] eta: 0:03:27 lr: 0.000045 grad: 0.1469 (0.1802) loss: 0.7930 (0.7932) time: 0.2685 data: 0.0002 max mem: 26157 Train: [60] [5600/6250] eta: 0:02:59 lr: 0.000045 grad: 0.1471 (0.1807) loss: 0.7959 (0.7931) time: 0.2699 data: 0.0002 max mem: 26157 Train: [60] [5700/6250] eta: 0:02:31 lr: 0.000045 grad: 0.1593 (0.1814) loss: 0.7876 (0.7931) time: 0.2707 data: 0.0002 max mem: 26157 Train: [60] [5800/6250] eta: 0:02:04 lr: 0.000045 grad: 0.1524 (0.1815) loss: 0.7845 (0.7930) time: 0.2681 data: 0.0002 max mem: 26157 Train: [60] [5900/6250] eta: 0:01:36 lr: 0.000045 grad: 0.1499 (0.1817) loss: 0.7855 (0.7930) time: 0.2689 data: 0.0002 max mem: 26157 Train: [60] [6000/6250] eta: 0:01:08 lr: 0.000045 grad: 0.1528 (0.1816) loss: 0.7947 (0.7929) time: 0.2701 data: 0.0002 max mem: 26157 Train: [60] [6100/6250] eta: 0:00:41 lr: 0.000045 grad: 0.1540 (0.1814) loss: 0.7898 (0.7929) time: 0.2692 data: 0.0002 max mem: 26157 Train: [60] [6200/6250] eta: 0:00:13 lr: 0.000045 grad: 0.1520 (0.1819) loss: 0.7897 (0.7928) time: 0.2750 data: 0.0002 max mem: 26157 Train: [60] [6249/6250] eta: 0:00:00 lr: 0.000045 grad: 0.1518 (0.1819) loss: 0.7929 (0.7928) time: 0.2705 data: 0.0002 max mem: 26157 Train: [60] Total time: 0:28:49 (0.2767 s / it) Averaged stats: lr: 0.000045 grad: 0.1518 (0.1819) loss: 0.7929 (0.7928) Eval (hcp-train-subset): [60] [ 0/62] eta: 0:03:52 loss: 0.8252 (0.8252) time: 3.7458 data: 3.6363 max mem: 26157 Eval (hcp-train-subset): [60] [61/62] eta: 0:00:00 loss: 0.8151 (0.8165) time: 0.1209 data: 0.0380 max mem: 26157 Eval (hcp-train-subset): [60] Total time: 0:00:13 (0.2136 s / it) Averaged stats (hcp-train-subset): loss: 0.8151 (0.8165) Making plots (hcp-train-subset): example=10 Eval (hcp-val): [60] [ 0/62] eta: 0:05:32 loss: 0.8185 (0.8185) time: 5.3575 data: 5.2728 max mem: 26157 Eval (hcp-val): [60] [61/62] eta: 0:00:00 loss: 0.8254 (0.8248) time: 0.1259 data: 0.0413 max mem: 26157 Eval (hcp-val): [60] Total time: 0:00:13 (0.2127 s / it) Averaged stats (hcp-val): loss: 0.8254 (0.8248) Making plots (hcp-val): example=57 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [61] [ 0/6250] eta: 11:00:24 lr: 0.000045 grad: 0.1489 (0.1489) loss: 0.8429 (0.8429) time: 6.3399 data: 6.0610 max mem: 26157 Train: [61] [ 100/6250] eta: 0:34:17 lr: 0.000045 grad: 0.1611 (0.2136) loss: 0.8087 (0.8137) time: 0.2693 data: 0.0002 max mem: 26157 Train: [61] [ 200/6250] eta: 0:30:30 lr: 0.000045 grad: 0.1849 (0.2110) loss: 0.8004 (0.8049) time: 0.2710 data: 0.0002 max mem: 26157 Train: [61] [ 300/6250] eta: 0:28:55 lr: 0.000045 grad: 0.1431 (0.2046) loss: 0.7950 (0.8022) time: 0.2696 data: 0.0002 max mem: 26157 Train: [61] [ 400/6250] eta: 0:27:53 lr: 0.000045 grad: 0.1646 (0.2021) loss: 0.7888 (0.8012) time: 0.2702 data: 0.0002 max mem: 26157 Train: [61] [ 500/6250] eta: 0:27:05 lr: 0.000045 grad: 0.1489 (0.1997) loss: 0.7958 (0.8004) time: 0.2690 data: 0.0002 max mem: 26157 Train: [61] [ 600/6250] eta: 0:26:27 lr: 0.000045 grad: 0.1542 (0.1968) loss: 0.8049 (0.7999) time: 0.2711 data: 0.0002 max mem: 26157 Train: [61] [ 700/6250] eta: 0:25:52 lr: 0.000045 grad: 0.1727 (0.1966) loss: 0.8039 (0.7995) time: 0.2738 data: 0.0002 max mem: 26157 Train: [61] [ 800/6250] eta: 0:26:03 lr: 0.000045 grad: 0.1440 (0.1942) loss: 0.7894 (0.7995) time: 0.2733 data: 0.0004 max mem: 26157 Train: [61] [ 900/6250] eta: 0:25:59 lr: 0.000045 grad: 0.1470 (0.1934) loss: 0.7988 (0.7992) time: 0.2710 data: 0.0002 max mem: 26157 Train: [61] [1000/6250] eta: 0:25:28 lr: 0.000045 grad: 0.1545 (0.1912) loss: 0.7939 (0.7987) time: 0.2735 data: 0.0002 max mem: 26157 Train: [61] [1100/6250] eta: 0:24:49 lr: 0.000045 grad: 0.1608 (0.1901) loss: 0.7924 (0.7981) time: 0.2716 data: 0.0003 max mem: 26157 Train: [61] [1200/6250] eta: 0:24:30 lr: 0.000045 grad: 0.1586 (0.1890) loss: 0.7917 (0.7975) time: 0.3384 data: 0.0675 max mem: 26157 Train: [61] [1300/6250] eta: 0:23:53 lr: 0.000045 grad: 0.1503 (0.1871) loss: 0.7933 (0.7970) time: 0.2759 data: 0.0003 max mem: 26157 Train: [61] [1400/6250] eta: 0:23:25 lr: 0.000045 grad: 0.1461 (0.1858) loss: 0.7892 (0.7965) time: 0.2693 data: 0.0002 max mem: 26157 Train: [61] [1500/6250] eta: 0:22:50 lr: 0.000045 grad: 0.1524 (0.1867) loss: 0.7888 (0.7960) time: 0.2681 data: 0.0002 max mem: 26157 Train: [61] [1600/6250] eta: 0:22:15 lr: 0.000045 grad: 0.1594 (0.1862) loss: 0.7930 (0.7956) time: 0.2694 data: 0.0002 max mem: 26157 Train: [61] [1700/6250] eta: 0:21:42 lr: 0.000045 grad: 0.1587 (0.1879) loss: 0.7848 (0.7952) time: 0.2683 data: 0.0002 max mem: 26157 Train: [61] [1800/6250] eta: 0:21:09 lr: 0.000045 grad: 0.1507 (0.1886) loss: 0.7922 (0.7949) time: 0.2686 data: 0.0002 max mem: 26157 Train: [61] [1900/6250] eta: 0:20:47 lr: 0.000045 grad: 0.1595 (0.1891) loss: 0.7871 (0.7946) time: 0.2696 data: 0.0002 max mem: 26157 Train: [61] [2000/6250] eta: 0:20:17 lr: 0.000045 grad: 0.1732 (0.1894) loss: 0.7851 (0.7945) time: 0.3221 data: 0.0485 max mem: 26157 Train: [61] [2100/6250] eta: 0:19:45 lr: 0.000044 grad: 0.1500 (0.1906) loss: 0.7899 (0.7942) time: 0.2677 data: 0.0001 max mem: 26157 Train: [61] [2200/6250] eta: 0:19:14 lr: 0.000044 grad: 0.1477 (0.1900) loss: 0.7862 (0.7939) time: 0.2698 data: 0.0002 max mem: 26157 Train: [61] [2300/6250] eta: 0:18:43 lr: 0.000044 grad: 0.1463 (0.1890) loss: 0.7898 (0.7938) time: 0.2687 data: 0.0002 max mem: 26157 Train: [61] [2400/6250] eta: 0:18:12 lr: 0.000044 grad: 0.1632 (0.1893) loss: 0.7896 (0.7936) time: 0.2707 data: 0.0002 max mem: 26157 Train: [61] [2500/6250] eta: 0:17:41 lr: 0.000044 grad: 0.1503 (0.1898) loss: 0.7932 (0.7935) time: 0.2687 data: 0.0002 max mem: 26157 Train: [61] [2600/6250] eta: 0:17:11 lr: 0.000044 grad: 0.1471 (0.1896) loss: 0.7921 (0.7934) time: 0.2692 data: 0.0002 max mem: 26157 Train: [61] [2700/6250] eta: 0:16:41 lr: 0.000044 grad: 0.1830 (0.1897) loss: 0.7860 (0.7932) time: 0.2700 data: 0.0002 max mem: 26157 Train: [61] [2800/6250] eta: 0:16:11 lr: 0.000044 grad: 0.1692 (0.1894) loss: 0.7889 (0.7932) time: 0.2700 data: 0.0002 max mem: 26157 Train: [61] [2900/6250] eta: 0:15:42 lr: 0.000044 grad: 0.1364 (0.1890) loss: 0.7931 (0.7931) time: 0.2705 data: 0.0002 max mem: 26157 Train: [61] [3000/6250] eta: 0:15:12 lr: 0.000044 grad: 0.1681 (0.1888) loss: 0.7923 (0.7931) time: 0.2694 data: 0.0002 max mem: 26157 Train: [61] [3100/6250] eta: 0:14:43 lr: 0.000044 grad: 0.1493 (0.1888) loss: 0.7845 (0.7930) time: 0.2696 data: 0.0002 max mem: 26157 Train: [61] [3200/6250] eta: 0:14:14 lr: 0.000044 grad: 0.1613 (0.1897) loss: 0.7892 (0.7928) time: 0.2681 data: 0.0002 max mem: 26157 Train: [61] [3300/6250] eta: 0:13:45 lr: 0.000044 grad: 0.1788 (0.1902) loss: 0.7849 (0.7928) time: 0.2686 data: 0.0002 max mem: 26157 Train: [61] [3400/6250] eta: 0:13:16 lr: 0.000044 grad: 0.2301 (0.1920) loss: 0.7853 (0.7927) time: 0.2694 data: 0.0002 max mem: 26157 Train: [61] [3500/6250] eta: 0:12:48 lr: 0.000044 grad: 0.1485 (0.1919) loss: 0.7929 (0.7927) time: 0.2703 data: 0.0002 max mem: 26157 Train: [61] [3600/6250] eta: 0:12:19 lr: 0.000044 grad: 0.1678 (0.1926) loss: 0.7922 (0.7927) time: 0.2687 data: 0.0002 max mem: 26157 Train: [61] [3700/6250] eta: 0:11:51 lr: 0.000044 grad: 0.1651 (0.1929) loss: 0.7882 (0.7926) time: 0.2688 data: 0.0002 max mem: 26157 Train: [61] [3800/6250] eta: 0:11:22 lr: 0.000044 grad: 0.1566 (0.1929) loss: 0.7857 (0.7926) time: 0.2717 data: 0.0002 max mem: 26157 Train: [61] [3900/6250] eta: 0:10:54 lr: 0.000044 grad: 0.1927 (0.1933) loss: 0.7862 (0.7926) time: 0.2693 data: 0.0002 max mem: 26157 Train: [61] [4000/6250] eta: 0:10:26 lr: 0.000044 grad: 0.1587 (0.1935) loss: 0.7899 (0.7926) time: 0.2684 data: 0.0002 max mem: 26157 Train: [61] [4100/6250] eta: 0:09:57 lr: 0.000044 grad: 0.1575 (0.1935) loss: 0.7968 (0.7926) time: 0.2677 data: 0.0002 max mem: 26157 Train: [61] [4200/6250] eta: 0:09:29 lr: 0.000044 grad: 0.1744 (0.1938) loss: 0.7918 (0.7926) time: 0.2686 data: 0.0002 max mem: 26157 Train: [61] [4300/6250] eta: 0:09:01 lr: 0.000044 grad: 0.2041 (0.1943) loss: 0.7956 (0.7926) time: 0.2676 data: 0.0002 max mem: 26157 Train: [61] [4400/6250] eta: 0:08:33 lr: 0.000044 grad: 0.1635 (0.1950) loss: 0.7885 (0.7926) time: 0.2723 data: 0.0002 max mem: 26157 Train: [61] [4500/6250] eta: 0:08:05 lr: 0.000044 grad: 0.1580 (0.1947) loss: 0.7905 (0.7925) time: 0.2694 data: 0.0002 max mem: 26157 Train: [61] [4600/6250] eta: 0:07:37 lr: 0.000044 grad: 0.1618 (0.1944) loss: 0.7863 (0.7925) time: 0.2683 data: 0.0002 max mem: 26157 Train: [61] [4700/6250] eta: 0:07:09 lr: 0.000044 grad: 0.1613 (0.1943) loss: 0.7920 (0.7925) time: 0.3148 data: 0.0359 max mem: 26157 Train: [61] [4800/6250] eta: 0:06:41 lr: 0.000044 grad: 0.1437 (0.1942) loss: 0.7939 (0.7925) time: 0.2705 data: 0.0002 max mem: 26157 Train: [61] [4900/6250] eta: 0:06:13 lr: 0.000044 grad: 0.1541 (0.1940) loss: 0.7915 (0.7925) time: 0.2714 data: 0.0002 max mem: 26157 Train: [61] [5000/6250] eta: 0:05:45 lr: 0.000044 grad: 0.1535 (0.1936) loss: 0.7936 (0.7924) time: 0.2740 data: 0.0002 max mem: 26157 Train: [61] [5100/6250] eta: 0:05:18 lr: 0.000044 grad: 0.1667 (0.1934) loss: 0.7883 (0.7924) time: 0.2698 data: 0.0002 max mem: 26157 Train: [61] [5200/6250] eta: 0:04:50 lr: 0.000044 grad: 0.1673 (0.1930) loss: 0.7870 (0.7924) time: 0.2750 data: 0.0002 max mem: 26157 Train: [61] [5300/6250] eta: 0:04:22 lr: 0.000043 grad: 0.1724 (0.1934) loss: 0.7934 (0.7924) time: 0.2753 data: 0.0002 max mem: 26157 Train: [61] [5400/6250] eta: 0:03:54 lr: 0.000043 grad: 0.1690 (0.1938) loss: 0.7954 (0.7924) time: 0.2715 data: 0.0002 max mem: 26157 Train: [61] [5500/6250] eta: 0:03:27 lr: 0.000043 grad: 0.1548 (0.1939) loss: 0.7944 (0.7923) time: 0.2699 data: 0.0002 max mem: 26157 Train: [61] [5600/6250] eta: 0:03:00 lr: 0.000043 grad: 0.1628 (0.1939) loss: 0.7971 (0.7923) time: 0.5896 data: 0.3198 max mem: 26157 Train: [61] [5700/6250] eta: 0:02:32 lr: 0.000043 grad: 0.1583 (0.1942) loss: 0.7812 (0.7923) time: 0.2685 data: 0.0002 max mem: 26157 Train: [61] [5800/6250] eta: 0:02:05 lr: 0.000043 grad: 0.1798 (0.1942) loss: 0.7854 (0.7922) time: 0.2712 data: 0.0002 max mem: 26157 Train: [61] [5900/6250] eta: 0:01:37 lr: 0.000043 grad: 0.1556 (0.1942) loss: 0.7887 (0.7922) time: 0.2696 data: 0.0002 max mem: 26157 Train: [61] [6000/6250] eta: 0:01:09 lr: 0.000043 grad: 0.1527 (0.1944) loss: 0.7916 (0.7922) time: 0.2692 data: 0.0002 max mem: 26157 Train: [61] [6100/6250] eta: 0:00:41 lr: 0.000043 grad: 0.1588 (0.1944) loss: 0.7868 (0.7921) time: 0.2719 data: 0.0002 max mem: 26157 Train: [61] [6200/6250] eta: 0:00:13 lr: 0.000043 grad: 0.1507 (0.1944) loss: 0.7893 (0.7921) time: 0.2678 data: 0.0001 max mem: 26157 Train: [61] [6249/6250] eta: 0:00:00 lr: 0.000043 grad: 0.1450 (0.1945) loss: 0.7887 (0.7921) time: 0.2704 data: 0.0002 max mem: 26157 Train: [61] Total time: 0:29:01 (0.2786 s / it) Averaged stats: lr: 0.000043 grad: 0.1450 (0.1945) loss: 0.7887 (0.7921) Eval (hcp-train-subset): [61] [ 0/62] eta: 0:04:53 loss: 0.8256 (0.8256) time: 4.7362 data: 4.6522 max mem: 26157 Eval (hcp-train-subset): [61] [61/62] eta: 0:00:00 loss: 0.8149 (0.8169) time: 0.1235 data: 0.0405 max mem: 26157 Eval (hcp-train-subset): [61] Total time: 0:00:13 (0.2104 s / it) Averaged stats (hcp-train-subset): loss: 0.8149 (0.8169) Making plots (hcp-train-subset): example=36 Eval (hcp-val): [61] [ 0/62] eta: 0:04:23 loss: 0.8239 (0.8239) time: 4.2540 data: 4.1454 max mem: 26157 Eval (hcp-val): [61] [61/62] eta: 0:00:00 loss: 0.8239 (0.8254) time: 0.1342 data: 0.0513 max mem: 26157 Eval (hcp-val): [61] Total time: 0:00:13 (0.2193 s / it) Averaged stats (hcp-val): loss: 0.8239 (0.8254) Making plots (hcp-val): example=27 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [62] [ 0/6250] eta: 11:49:07 lr: 0.000043 grad: 0.7045 (0.7045) loss: 0.7668 (0.7668) time: 6.8076 data: 6.5280 max mem: 26157 Train: [62] [ 100/6250] eta: 0:34:31 lr: 0.000043 grad: 0.1666 (0.2059) loss: 0.7945 (0.8088) time: 0.2719 data: 0.0002 max mem: 26157 Train: [62] [ 200/6250] eta: 0:30:45 lr: 0.000043 grad: 0.1904 (0.1958) loss: 0.7957 (0.8039) time: 0.2721 data: 0.0002 max mem: 26157 Train: [62] [ 300/6250] eta: 0:29:07 lr: 0.000043 grad: 0.1766 (0.1936) loss: 0.7935 (0.7996) time: 0.2692 data: 0.0002 max mem: 26157 Train: [62] [ 400/6250] eta: 0:28:05 lr: 0.000043 grad: 0.1524 (0.1931) loss: 0.7928 (0.7966) time: 0.2728 data: 0.0002 max mem: 26157 Train: [62] [ 500/6250] eta: 0:27:15 lr: 0.000043 grad: 0.1523 (0.1961) loss: 0.7860 (0.7945) time: 0.2683 data: 0.0002 max mem: 26157 Train: [62] [ 600/6250] eta: 0:26:33 lr: 0.000043 grad: 0.1529 (0.1964) loss: 0.7880 (0.7937) time: 0.2715 data: 0.0002 max mem: 26157 Train: [62] [ 700/6250] eta: 0:25:55 lr: 0.000043 grad: 0.1532 (0.1960) loss: 0.7944 (0.7935) time: 0.2699 data: 0.0002 max mem: 26157 Train: [62] [ 800/6250] eta: 0:25:19 lr: 0.000043 grad: 0.1871 (0.1977) loss: 0.7949 (0.7930) time: 0.2709 data: 0.0002 max mem: 26157 Train: [62] [ 900/6250] eta: 0:24:46 lr: 0.000043 grad: 0.1463 (0.1956) loss: 0.7935 (0.7928) time: 0.2686 data: 0.0002 max mem: 26157 Train: [62] [1000/6250] eta: 0:24:14 lr: 0.000043 grad: 0.1704 (0.1967) loss: 0.8035 (0.7928) time: 0.2694 data: 0.0002 max mem: 26157 Train: [62] [1100/6250] eta: 0:23:43 lr: 0.000043 grad: 0.1588 (0.1982) loss: 0.7879 (0.7925) time: 0.2685 data: 0.0002 max mem: 26157 Train: [62] [1200/6250] eta: 0:23:12 lr: 0.000043 grad: 0.1624 (0.1987) loss: 0.7984 (0.7921) time: 0.2682 data: 0.0002 max mem: 26157 Train: [62] [1300/6250] eta: 0:22:41 lr: 0.000043 grad: 0.1526 (0.1966) loss: 0.7906 (0.7918) time: 0.2680 data: 0.0002 max mem: 26157 Train: [62] [1400/6250] eta: 0:22:12 lr: 0.000043 grad: 0.1527 (0.1956) loss: 0.7914 (0.7917) time: 0.2692 data: 0.0002 max mem: 26157 Train: [62] [1500/6250] eta: 0:21:42 lr: 0.000043 grad: 0.1478 (0.1966) loss: 0.7883 (0.7914) time: 0.2688 data: 0.0002 max mem: 26157 Train: [62] [1600/6250] eta: 0:21:13 lr: 0.000043 grad: 0.1621 (0.1969) loss: 0.7967 (0.7913) time: 0.2676 data: 0.0002 max mem: 26157 Train: [62] [1700/6250] eta: 0:20:44 lr: 0.000043 grad: 0.1589 (0.1961) loss: 0.7874 (0.7910) time: 0.2671 data: 0.0002 max mem: 26157 Train: [62] [1800/6250] eta: 0:20:15 lr: 0.000043 grad: 0.1737 (0.1971) loss: 0.7911 (0.7909) time: 0.2681 data: 0.0002 max mem: 26157 Train: [62] [1900/6250] eta: 0:19:47 lr: 0.000043 grad: 0.1520 (0.1969) loss: 0.7882 (0.7908) time: 0.2717 data: 0.0002 max mem: 26157 Train: [62] [2000/6250] eta: 0:19:19 lr: 0.000043 grad: 0.1614 (0.1969) loss: 0.7915 (0.7907) time: 0.2680 data: 0.0002 max mem: 26157 Train: [62] [2100/6250] eta: 0:18:51 lr: 0.000043 grad: 0.1542 (0.1969) loss: 0.7834 (0.7906) time: 0.2685 data: 0.0002 max mem: 26157 Train: [62] [2200/6250] eta: 0:18:23 lr: 0.000042 grad: 0.1680 (0.1971) loss: 0.7825 (0.7904) time: 0.2684 data: 0.0002 max mem: 26157 Train: [62] [2300/6250] eta: 0:17:55 lr: 0.000042 grad: 0.1529 (0.1965) loss: 0.7829 (0.7901) time: 0.2677 data: 0.0002 max mem: 26157 Train: [62] [2400/6250] eta: 0:17:27 lr: 0.000042 grad: 0.1548 (0.1960) loss: 0.7890 (0.7899) time: 0.2687 data: 0.0002 max mem: 26157 Train: [62] [2500/6250] eta: 0:16:59 lr: 0.000042 grad: 0.1547 (0.1959) loss: 0.7886 (0.7897) time: 0.2672 data: 0.0002 max mem: 26157 Train: [62] [2600/6250] eta: 0:16:32 lr: 0.000042 grad: 0.1508 (0.1961) loss: 0.7858 (0.7896) time: 0.2689 data: 0.0002 max mem: 26157 Train: [62] [2700/6250] eta: 0:16:04 lr: 0.000042 grad: 0.1566 (0.1963) loss: 0.7899 (0.7895) time: 0.2681 data: 0.0002 max mem: 26157 Train: [62] [2800/6250] eta: 0:15:36 lr: 0.000042 grad: 0.1564 (0.1973) loss: 0.7838 (0.7893) time: 0.2672 data: 0.0002 max mem: 26157 Train: [62] [2900/6250] eta: 0:15:09 lr: 0.000042 grad: 0.1584 (0.1969) loss: 0.7728 (0.7891) time: 0.2681 data: 0.0002 max mem: 26157 Train: [62] [3000/6250] eta: 0:14:41 lr: 0.000042 grad: 0.1651 (0.1971) loss: 0.7884 (0.7890) time: 0.2690 data: 0.0001 max mem: 26157 Train: [62] [3100/6250] eta: 0:14:14 lr: 0.000042 grad: 0.1422 (0.1968) loss: 0.7939 (0.7890) time: 0.2691 data: 0.0002 max mem: 26157 Train: [62] [3200/6250] eta: 0:13:47 lr: 0.000042 grad: 0.1814 (0.1970) loss: 0.7826 (0.7890) time: 0.2676 data: 0.0002 max mem: 26157 Train: [62] [3300/6250] eta: 0:13:19 lr: 0.000042 grad: 0.1626 (0.1972) loss: 0.7846 (0.7889) time: 0.2681 data: 0.0001 max mem: 26157 Train: [62] [3400/6250] eta: 0:12:52 lr: 0.000042 grad: 0.1575 (0.1971) loss: 0.7904 (0.7888) time: 0.2703 data: 0.0002 max mem: 26157 Train: [62] [3500/6250] eta: 0:12:25 lr: 0.000042 grad: 0.1587 (0.1973) loss: 0.7904 (0.7888) time: 0.2684 data: 0.0001 max mem: 26157 Train: [62] [3600/6250] eta: 0:11:57 lr: 0.000042 grad: 0.1520 (0.1969) loss: 0.7926 (0.7887) time: 0.2677 data: 0.0002 max mem: 26157 Train: [62] [3700/6250] eta: 0:11:30 lr: 0.000042 grad: 0.1595 (0.1975) loss: 0.7861 (0.7886) time: 0.2677 data: 0.0002 max mem: 26157 Train: [62] [3800/6250] eta: 0:11:03 lr: 0.000042 grad: 0.1492 (0.1982) loss: 0.7893 (0.7886) time: 0.2672 data: 0.0002 max mem: 26157 Train: [62] [3900/6250] eta: 0:10:36 lr: 0.000042 grad: 0.1455 (0.1972) loss: 0.7885 (0.7885) time: 0.2699 data: 0.0002 max mem: 26157 Train: [62] [4000/6250] eta: 0:10:08 lr: 0.000042 grad: 0.1574 (0.1966) loss: 0.7861 (0.7885) time: 0.2691 data: 0.0002 max mem: 26157 Train: [62] [4100/6250] eta: 0:09:41 lr: 0.000042 grad: 0.1596 (0.1973) loss: 0.7869 (0.7884) time: 0.2679 data: 0.0002 max mem: 26157 Train: [62] [4200/6250] eta: 0:09:14 lr: 0.000042 grad: 0.1506 (0.1974) loss: 0.7818 (0.7885) time: 0.2688 data: 0.0001 max mem: 26157 Train: [62] [4300/6250] eta: 0:08:47 lr: 0.000042 grad: 0.1530 (0.1977) loss: 0.7883 (0.7885) time: 0.2698 data: 0.0002 max mem: 26157 Train: [62] [4400/6250] eta: 0:08:20 lr: 0.000042 grad: 0.1516 (0.1981) loss: 0.7916 (0.7886) time: 0.2709 data: 0.0002 max mem: 26157 Train: [62] [4500/6250] eta: 0:07:53 lr: 0.000042 grad: 0.1472 (0.1983) loss: 0.7905 (0.7886) time: 0.2676 data: 0.0002 max mem: 26157 Train: [62] [4600/6250] eta: 0:07:26 lr: 0.000042 grad: 0.1693 (0.1985) loss: 0.7931 (0.7887) time: 0.2695 data: 0.0002 max mem: 26157 Train: [62] [4700/6250] eta: 0:06:59 lr: 0.000042 grad: 0.1652 (0.1992) loss: 0.7896 (0.7888) time: 0.2680 data: 0.0002 max mem: 26157 Train: [62] [4800/6250] eta: 0:06:32 lr: 0.000042 grad: 0.1798 (0.1995) loss: 0.7935 (0.7889) time: 0.2697 data: 0.0002 max mem: 26157 Train: [62] [4900/6250] eta: 0:06:05 lr: 0.000042 grad: 0.1646 (0.2005) loss: 0.7982 (0.7890) time: 0.2692 data: 0.0002 max mem: 26157 Train: [62] [5000/6250] eta: 0:05:38 lr: 0.000042 grad: 0.1658 (0.2008) loss: 0.7859 (0.7890) time: 0.2678 data: 0.0002 max mem: 26157 Train: [62] [5100/6250] eta: 0:05:11 lr: 0.000042 grad: 0.1725 (0.2014) loss: 0.7867 (0.7889) time: 0.2700 data: 0.0002 max mem: 26157 Train: [62] [5200/6250] eta: 0:04:43 lr: 0.000042 grad: 0.1569 (0.2018) loss: 0.7932 (0.7890) time: 0.2690 data: 0.0002 max mem: 26157 Train: [62] [5300/6250] eta: 0:04:16 lr: 0.000042 grad: 0.1772 (0.2024) loss: 0.7858 (0.7890) time: 0.2692 data: 0.0002 max mem: 26157 Train: [62] [5400/6250] eta: 0:03:49 lr: 0.000041 grad: 0.1545 (0.2021) loss: 0.7971 (0.7890) time: 0.2683 data: 0.0002 max mem: 26157 Train: [62] [5500/6250] eta: 0:03:22 lr: 0.000041 grad: 0.1551 (0.2021) loss: 0.7877 (0.7889) time: 0.2694 data: 0.0002 max mem: 26157 Train: [62] [5600/6250] eta: 0:02:55 lr: 0.000041 grad: 0.1644 (0.2023) loss: 0.7923 (0.7889) time: 0.2686 data: 0.0002 max mem: 26157 Train: [62] [5700/6250] eta: 0:02:28 lr: 0.000041 grad: 0.1524 (0.2020) loss: 0.7880 (0.7889) time: 0.2685 data: 0.0002 max mem: 26157 Train: [62] [5800/6250] eta: 0:02:01 lr: 0.000041 grad: 0.1738 (0.2020) loss: 0.7895 (0.7889) time: 0.2680 data: 0.0001 max mem: 26157 Train: [62] [5900/6250] eta: 0:01:34 lr: 0.000041 grad: 0.1671 (0.2022) loss: 0.7920 (0.7889) time: 0.2690 data: 0.0002 max mem: 26157 Train: [62] [6000/6250] eta: 0:01:07 lr: 0.000041 grad: 0.1740 (0.2031) loss: 0.7992 (0.7889) time: 0.2691 data: 0.0002 max mem: 26157 Train: [62] [6100/6250] eta: 0:00:40 lr: 0.000041 grad: 0.1611 (0.2030) loss: 0.7877 (0.7890) time: 0.2689 data: 0.0002 max mem: 26157 Train: [62] [6200/6250] eta: 0:00:13 lr: 0.000041 grad: 0.1570 (0.2030) loss: 0.7891 (0.7890) time: 0.2675 data: 0.0001 max mem: 26157 Train: [62] [6249/6250] eta: 0:00:00 lr: 0.000041 grad: 0.1587 (0.2030) loss: 0.7887 (0.7890) time: 0.2679 data: 0.0001 max mem: 26157 Train: [62] Total time: 0:28:14 (0.2712 s / it) Averaged stats: lr: 0.000041 grad: 0.1587 (0.2030) loss: 0.7887 (0.7890) Eval (hcp-train-subset): [62] [ 0/62] eta: 0:03:29 loss: 0.8228 (0.8228) time: 3.3726 data: 3.2615 max mem: 26157 Eval (hcp-train-subset): [62] [61/62] eta: 0:00:00 loss: 0.8174 (0.8165) time: 0.1021 data: 0.0174 max mem: 26157 Eval (hcp-train-subset): [62] Total time: 0:00:12 (0.1997 s / it) Averaged stats (hcp-train-subset): loss: 0.8174 (0.8165) Making plots (hcp-train-subset): example=45 Eval (hcp-val): [62] [ 0/62] eta: 0:04:45 loss: 0.8242 (0.8242) time: 4.6059 data: 4.5214 max mem: 26157 Eval (hcp-val): [62] [61/62] eta: 0:00:00 loss: 0.8235 (0.8249) time: 0.1298 data: 0.0452 max mem: 26157 Eval (hcp-val): [62] Total time: 0:00:13 (0.2106 s / it) Averaged stats (hcp-val): loss: 0.8235 (0.8249) Making plots (hcp-val): example=58 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [63] [ 0/6250] eta: 9:08:42 lr: 0.000041 grad: 0.1585 (0.1585) loss: 0.8331 (0.8331) time: 5.2676 data: 4.9333 max mem: 26157 Train: [63] [ 100/6250] eta: 0:33:16 lr: 0.000041 grad: 0.1830 (0.2186) loss: 0.7935 (0.7999) time: 0.2693 data: 0.0002 max mem: 26157 Train: [63] [ 200/6250] eta: 0:29:59 lr: 0.000041 grad: 0.1578 (0.2031) loss: 0.7860 (0.7965) time: 0.2703 data: 0.0002 max mem: 26157 Train: [63] [ 300/6250] eta: 0:28:34 lr: 0.000041 grad: 0.1557 (0.1943) loss: 0.7879 (0.7960) time: 0.2717 data: 0.0002 max mem: 26157 Train: [63] [ 400/6250] eta: 0:27:37 lr: 0.000041 grad: 0.1517 (0.1853) loss: 0.7971 (0.7966) time: 0.2680 data: 0.0002 max mem: 26157 Train: [63] [ 500/6250] eta: 0:26:52 lr: 0.000041 grad: 0.1511 (0.1829) loss: 0.7954 (0.7971) time: 0.2675 data: 0.0002 max mem: 26157 Train: [63] [ 600/6250] eta: 0:26:13 lr: 0.000041 grad: 0.1333 (0.1839) loss: 0.8041 (0.7976) time: 0.2683 data: 0.0001 max mem: 26157 Train: [63] [ 700/6250] eta: 0:25:38 lr: 0.000041 grad: 0.1408 (0.1796) loss: 0.8043 (0.7978) time: 0.2685 data: 0.0002 max mem: 26157 Train: [63] [ 800/6250] eta: 0:25:04 lr: 0.000041 grad: 0.1523 (0.1779) loss: 0.7989 (0.7980) time: 0.2692 data: 0.0002 max mem: 26157 Train: [63] [ 900/6250] eta: 0:24:32 lr: 0.000041 grad: 0.1527 (0.1804) loss: 0.8015 (0.7980) time: 0.2684 data: 0.0002 max mem: 26157 Train: [63] [1000/6250] eta: 0:24:02 lr: 0.000041 grad: 0.1630 (0.1806) loss: 0.7939 (0.7980) time: 0.2717 data: 0.0002 max mem: 26157 Train: [63] [1100/6250] eta: 0:23:32 lr: 0.000041 grad: 0.1589 (0.1801) loss: 0.7889 (0.7975) time: 0.2685 data: 0.0002 max mem: 26157 Train: [63] [1200/6250] eta: 0:23:02 lr: 0.000041 grad: 0.1538 (0.1807) loss: 0.7913 (0.7971) time: 0.2678 data: 0.0002 max mem: 26157 Train: [63] [1300/6250] eta: 0:22:32 lr: 0.000041 grad: 0.1575 (0.1807) loss: 0.7851 (0.7963) time: 0.2677 data: 0.0001 max mem: 26157 Train: [63] [1400/6250] eta: 0:22:03 lr: 0.000041 grad: 0.1616 (0.1817) loss: 0.7836 (0.7955) time: 0.2688 data: 0.0002 max mem: 26157 Train: [63] [1500/6250] eta: 0:21:35 lr: 0.000041 grad: 0.1838 (0.1827) loss: 0.7808 (0.7949) time: 0.2674 data: 0.0002 max mem: 26157 Train: [63] [1600/6250] eta: 0:21:06 lr: 0.000041 grad: 0.1626 (0.1829) loss: 0.7897 (0.7944) time: 0.2669 data: 0.0002 max mem: 26157 Train: [63] [1700/6250] eta: 0:20:38 lr: 0.000041 grad: 0.1677 (0.1831) loss: 0.7874 (0.7940) time: 0.2675 data: 0.0002 max mem: 26157 Train: [63] [1800/6250] eta: 0:20:09 lr: 0.000041 grad: 0.1577 (0.1831) loss: 0.7837 (0.7937) time: 0.2674 data: 0.0002 max mem: 26157 Train: [63] [1900/6250] eta: 0:19:41 lr: 0.000041 grad: 0.1575 (0.1827) loss: 0.7799 (0.7934) time: 0.2679 data: 0.0002 max mem: 26157 Train: [63] [2000/6250] eta: 0:19:13 lr: 0.000041 grad: 0.1591 (0.1835) loss: 0.7906 (0.7930) time: 0.2680 data: 0.0002 max mem: 26157 Train: [63] [2100/6250] eta: 0:18:46 lr: 0.000041 grad: 0.1551 (0.1839) loss: 0.7864 (0.7928) time: 0.2687 data: 0.0002 max mem: 26157 Train: [63] [2200/6250] eta: 0:18:18 lr: 0.000041 grad: 0.1934 (0.1849) loss: 0.7837 (0.7925) time: 0.2691 data: 0.0002 max mem: 26157 Train: [63] [2300/6250] eta: 0:17:50 lr: 0.000041 grad: 0.1750 (0.1852) loss: 0.7864 (0.7924) time: 0.2675 data: 0.0002 max mem: 26157 Train: [63] [2400/6250] eta: 0:17:23 lr: 0.000040 grad: 0.1624 (0.1852) loss: 0.7828 (0.7921) time: 0.2684 data: 0.0002 max mem: 26157 Train: [63] [2500/6250] eta: 0:16:55 lr: 0.000040 grad: 0.1652 (0.1861) loss: 0.7886 (0.7918) time: 0.2677 data: 0.0002 max mem: 26157 Train: [63] [2600/6250] eta: 0:16:28 lr: 0.000040 grad: 0.1819 (0.1866) loss: 0.7839 (0.7916) time: 0.2679 data: 0.0002 max mem: 26157 Train: [63] [2700/6250] eta: 0:16:01 lr: 0.000040 grad: 0.1655 (0.1870) loss: 0.7864 (0.7915) time: 0.2683 data: 0.0002 max mem: 26157 Train: [63] [2800/6250] eta: 0:15:33 lr: 0.000040 grad: 0.1711 (0.1874) loss: 0.7910 (0.7914) time: 0.2677 data: 0.0001 max mem: 26157 Train: [63] [2900/6250] eta: 0:15:06 lr: 0.000040 grad: 0.1688 (0.1877) loss: 0.7846 (0.7913) time: 0.2677 data: 0.0001 max mem: 26157 Train: [63] [3000/6250] eta: 0:14:38 lr: 0.000040 grad: 0.1527 (0.1881) loss: 0.7784 (0.7912) time: 0.2680 data: 0.0001 max mem: 26157 Train: [63] [3100/6250] eta: 0:14:11 lr: 0.000040 grad: 0.1646 (0.1894) loss: 0.7840 (0.7911) time: 0.2680 data: 0.0001 max mem: 26157 Train: [63] [3200/6250] eta: 0:13:44 lr: 0.000040 grad: 0.1573 (0.1896) loss: 0.7881 (0.7910) time: 0.2690 data: 0.0002 max mem: 26157 Train: [63] [3300/6250] eta: 0:13:17 lr: 0.000040 grad: 0.1647 (0.1900) loss: 0.7859 (0.7909) time: 0.2703 data: 0.0002 max mem: 26157 Train: [63] [3400/6250] eta: 0:12:49 lr: 0.000040 grad: 0.1534 (0.1896) loss: 0.7861 (0.7908) time: 0.2684 data: 0.0001 max mem: 26157 Train: [63] [3500/6250] eta: 0:12:22 lr: 0.000040 grad: 0.1588 (0.1894) loss: 0.7839 (0.7907) time: 0.2689 data: 0.0002 max mem: 26157 Train: [63] [3600/6250] eta: 0:11:55 lr: 0.000040 grad: 0.1406 (0.1898) loss: 0.7917 (0.7907) time: 0.2705 data: 0.0001 max mem: 26157 Train: [63] [3700/6250] eta: 0:11:28 lr: 0.000040 grad: 0.1638 (0.1896) loss: 0.7938 (0.7907) time: 0.2687 data: 0.0002 max mem: 26157 Train: [63] [3800/6250] eta: 0:11:01 lr: 0.000040 grad: 0.1581 (0.1900) loss: 0.7908 (0.7908) time: 0.2694 data: 0.0002 max mem: 26157 Train: [63] [3900/6250] eta: 0:10:34 lr: 0.000040 grad: 0.1497 (0.1902) loss: 0.7834 (0.7907) time: 0.2704 data: 0.0002 max mem: 26157 Train: [63] [4000/6250] eta: 0:10:07 lr: 0.000040 grad: 0.1686 (0.1906) loss: 0.7868 (0.7906) time: 0.2688 data: 0.0001 max mem: 26157 Train: [63] [4100/6250] eta: 0:09:40 lr: 0.000040 grad: 0.1586 (0.1909) loss: 0.7894 (0.7906) time: 0.2685 data: 0.0001 max mem: 26157 Train: [63] [4200/6250] eta: 0:09:13 lr: 0.000040 grad: 0.1674 (0.1914) loss: 0.7903 (0.7907) time: 0.2722 data: 0.0002 max mem: 26157 Train: [63] [4300/6250] eta: 0:08:46 lr: 0.000040 grad: 0.1566 (0.1914) loss: 0.7938 (0.7906) time: 0.2698 data: 0.0002 max mem: 26157 Train: [63] [4400/6250] eta: 0:08:19 lr: 0.000040 grad: 0.1503 (0.1926) loss: 0.7878 (0.7906) time: 0.2686 data: 0.0002 max mem: 26157 Train: [63] [4500/6250] eta: 0:07:52 lr: 0.000040 grad: 0.1617 (0.1922) loss: 0.7898 (0.7905) time: 0.2692 data: 0.0002 max mem: 26157 Train: [63] [4600/6250] eta: 0:07:25 lr: 0.000040 grad: 0.1514 (0.1928) loss: 0.7937 (0.7905) time: 0.2700 data: 0.0002 max mem: 26157 Train: [63] [4700/6250] eta: 0:06:58 lr: 0.000040 grad: 0.1516 (0.1931) loss: 0.7912 (0.7905) time: 0.2689 data: 0.0002 max mem: 26157 Train: [63] [4800/6250] eta: 0:06:31 lr: 0.000040 grad: 0.1642 (0.1930) loss: 0.7906 (0.7905) time: 0.2698 data: 0.0002 max mem: 26157 Train: [63] [4900/6250] eta: 0:06:04 lr: 0.000040 grad: 0.1636 (0.1932) loss: 0.7867 (0.7905) time: 0.2705 data: 0.0002 max mem: 26157 Train: [63] [5000/6250] eta: 0:05:37 lr: 0.000040 grad: 0.1487 (0.1929) loss: 0.7900 (0.7905) time: 0.2692 data: 0.0002 max mem: 26157 Train: [63] [5100/6250] eta: 0:05:10 lr: 0.000040 grad: 0.1632 (0.1930) loss: 0.7841 (0.7904) time: 0.2692 data: 0.0002 max mem: 26157 Train: [63] [5200/6250] eta: 0:04:43 lr: 0.000040 grad: 0.1525 (0.1928) loss: 0.7913 (0.7904) time: 0.2689 data: 0.0001 max mem: 26157 Train: [63] [5300/6250] eta: 0:04:16 lr: 0.000040 grad: 0.1560 (0.1931) loss: 0.7902 (0.7903) time: 0.2701 data: 0.0002 max mem: 26157 Train: [63] [5400/6250] eta: 0:03:49 lr: 0.000040 grad: 0.1527 (0.1928) loss: 0.7825 (0.7903) time: 0.2690 data: 0.0001 max mem: 26157 Train: [63] [5500/6250] eta: 0:03:22 lr: 0.000040 grad: 0.1687 (0.1929) loss: 0.7870 (0.7903) time: 0.2690 data: 0.0002 max mem: 26157 Train: [63] [5600/6250] eta: 0:02:55 lr: 0.000039 grad: 0.1684 (0.1932) loss: 0.7813 (0.7903) time: 0.2697 data: 0.0002 max mem: 26157 Train: [63] [5700/6250] eta: 0:02:28 lr: 0.000039 grad: 0.1621 (0.1932) loss: 0.7857 (0.7902) time: 0.2697 data: 0.0002 max mem: 26157 Train: [63] [5800/6250] eta: 0:02:01 lr: 0.000039 grad: 0.1575 (0.1933) loss: 0.7905 (0.7902) time: 0.2704 data: 0.0002 max mem: 26157 Train: [63] [5900/6250] eta: 0:01:34 lr: 0.000039 grad: 0.1579 (0.1930) loss: 0.7882 (0.7901) time: 0.2693 data: 0.0002 max mem: 26157 Train: [63] [6000/6250] eta: 0:01:07 lr: 0.000039 grad: 0.1656 (0.1935) loss: 0.7892 (0.7901) time: 0.2709 data: 0.0001 max mem: 26157 Train: [63] [6100/6250] eta: 0:00:40 lr: 0.000039 grad: 0.1641 (0.1935) loss: 0.7829 (0.7900) time: 0.2693 data: 0.0002 max mem: 26157 Train: [63] [6200/6250] eta: 0:00:13 lr: 0.000039 grad: 0.1593 (0.1937) loss: 0.7888 (0.7900) time: 0.2699 data: 0.0002 max mem: 26157 Train: [63] [6249/6250] eta: 0:00:00 lr: 0.000039 grad: 0.1610 (0.1936) loss: 0.7832 (0.7900) time: 0.2702 data: 0.0002 max mem: 26157 Train: [63] Total time: 0:28:12 (0.2708 s / it) Averaged stats: lr: 0.000039 grad: 0.1610 (0.1936) loss: 0.7832 (0.7900) Eval (hcp-train-subset): [63] [ 0/62] eta: 0:04:04 loss: 0.8144 (0.8144) time: 3.9418 data: 3.8364 max mem: 26157 Eval (hcp-train-subset): [63] [61/62] eta: 0:00:00 loss: 0.8140 (0.8150) time: 0.1236 data: 0.0390 max mem: 26157 Eval (hcp-train-subset): [63] Total time: 0:00:12 (0.1990 s / it) Averaged stats (hcp-train-subset): loss: 0.8140 (0.8150) Making plots (hcp-train-subset): example=46 Eval (hcp-val): [63] [ 0/62] eta: 0:05:26 loss: 0.8214 (0.8214) time: 5.2643 data: 5.1808 max mem: 26157 Eval (hcp-val): [63] [61/62] eta: 0:00:00 loss: 0.8237 (0.8249) time: 0.1085 data: 0.0240 max mem: 26157 Eval (hcp-val): [63] Total time: 0:00:12 (0.1981 s / it) Averaged stats (hcp-val): loss: 0.8237 (0.8249) Making plots (hcp-val): example=46 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [64] [ 0/6250] eta: 9:55:47 lr: 0.000039 grad: 0.1823 (0.1823) loss: 0.8267 (0.8267) time: 5.7195 data: 5.4430 max mem: 26157 Train: [64] [ 100/6250] eta: 0:33:09 lr: 0.000039 grad: 0.1599 (0.2203) loss: 0.8048 (0.8060) time: 0.2685 data: 0.0002 max mem: 26157 Train: [64] [ 200/6250] eta: 0:29:52 lr: 0.000039 grad: 0.1700 (0.2087) loss: 0.7896 (0.7987) time: 0.2683 data: 0.0002 max mem: 26157 Train: [64] [ 300/6250] eta: 0:28:28 lr: 0.000039 grad: 0.1608 (0.1968) loss: 0.7927 (0.7987) time: 0.2692 data: 0.0002 max mem: 26157 Train: [64] [ 400/6250] eta: 0:27:32 lr: 0.000039 grad: 0.1354 (0.1902) loss: 0.8010 (0.7980) time: 0.2696 data: 0.0002 max mem: 26157 Train: [64] [ 500/6250] eta: 0:26:48 lr: 0.000039 grad: 0.1497 (0.1863) loss: 0.7939 (0.7977) time: 0.2682 data: 0.0002 max mem: 26157 Train: [64] [ 600/6250] eta: 0:26:09 lr: 0.000039 grad: 0.1549 (0.1828) loss: 0.7987 (0.7981) time: 0.2693 data: 0.0002 max mem: 26157 Train: [64] [ 700/6250] eta: 0:25:34 lr: 0.000039 grad: 0.1586 (0.1829) loss: 0.7960 (0.7977) time: 0.2685 data: 0.0002 max mem: 26157 Train: [64] [ 800/6250] eta: 0:25:06 lr: 0.000039 grad: 0.1480 (0.1830) loss: 0.7909 (0.7969) time: 0.2690 data: 0.0002 max mem: 26157 Train: [64] [ 900/6250] eta: 0:24:34 lr: 0.000039 grad: 0.1493 (0.1832) loss: 0.7996 (0.7964) time: 0.2707 data: 0.0002 max mem: 26157 Train: [64] [1000/6250] eta: 0:24:03 lr: 0.000039 grad: 0.1351 (0.1818) loss: 0.7856 (0.7957) time: 0.2685 data: 0.0002 max mem: 26157 Train: [64] [1100/6250] eta: 0:23:32 lr: 0.000039 grad: 0.1531 (0.1811) loss: 0.7895 (0.7950) time: 0.2678 data: 0.0002 max mem: 26157 Train: [64] [1200/6250] eta: 0:23:02 lr: 0.000039 grad: 0.1523 (0.1811) loss: 0.7879 (0.7944) time: 0.2681 data: 0.0002 max mem: 26157 Train: [64] [1300/6250] eta: 0:22:33 lr: 0.000039 grad: 0.1613 (0.1810) loss: 0.7947 (0.7941) time: 0.2682 data: 0.0002 max mem: 26157 Train: [64] [1400/6250] eta: 0:22:04 lr: 0.000039 grad: 0.1366 (0.1800) loss: 0.7913 (0.7938) time: 0.2702 data: 0.0002 max mem: 26157 Train: [64] [1500/6250] eta: 0:21:35 lr: 0.000039 grad: 0.1414 (0.1804) loss: 0.7809 (0.7935) time: 0.2707 data: 0.0002 max mem: 26157 Train: [64] [1600/6250] eta: 0:21:07 lr: 0.000039 grad: 0.1522 (0.1803) loss: 0.7918 (0.7931) time: 0.2712 data: 0.0002 max mem: 26157 Train: [64] [1700/6250] eta: 0:20:39 lr: 0.000039 grad: 0.1447 (0.1809) loss: 0.7863 (0.7928) time: 0.2683 data: 0.0002 max mem: 26157 Train: [64] [1800/6250] eta: 0:20:11 lr: 0.000039 grad: 0.1668 (0.1826) loss: 0.7838 (0.7924) time: 0.2678 data: 0.0002 max mem: 26157 Train: [64] [1900/6250] eta: 0:19:43 lr: 0.000039 grad: 0.1445 (0.1829) loss: 0.7898 (0.7921) time: 0.2690 data: 0.0002 max mem: 26157 Train: [64] [2000/6250] eta: 0:19:15 lr: 0.000039 grad: 0.1437 (0.1843) loss: 0.7972 (0.7919) time: 0.2688 data: 0.0002 max mem: 26157 Train: [64] [2100/6250] eta: 0:18:47 lr: 0.000039 grad: 0.1646 (0.1839) loss: 0.7820 (0.7916) time: 0.2684 data: 0.0002 max mem: 26157 Train: [64] [2200/6250] eta: 0:18:19 lr: 0.000039 grad: 0.1467 (0.1829) loss: 0.7863 (0.7914) time: 0.2672 data: 0.0001 max mem: 26157 Train: [64] [2300/6250] eta: 0:17:52 lr: 0.000039 grad: 0.1498 (0.1835) loss: 0.7952 (0.7914) time: 0.2692 data: 0.0002 max mem: 26157 Train: [64] [2400/6250] eta: 0:17:24 lr: 0.000039 grad: 0.1562 (0.1842) loss: 0.7938 (0.7912) time: 0.2690 data: 0.0002 max mem: 26157 Train: [64] [2500/6250] eta: 0:16:57 lr: 0.000039 grad: 0.1679 (0.1862) loss: 0.7801 (0.7910) time: 0.2685 data: 0.0002 max mem: 26157 Train: [64] [2600/6250] eta: 0:16:29 lr: 0.000039 grad: 0.1536 (0.1860) loss: 0.7880 (0.7909) time: 0.2682 data: 0.0002 max mem: 26157 Train: [64] [2700/6250] eta: 0:16:02 lr: 0.000038 grad: 0.1512 (0.1864) loss: 0.7868 (0.7907) time: 0.2677 data: 0.0002 max mem: 26157 Train: [64] [2800/6250] eta: 0:15:34 lr: 0.000038 grad: 0.1495 (0.1868) loss: 0.7880 (0.7906) time: 0.2684 data: 0.0002 max mem: 26157 Train: [64] [2900/6250] eta: 0:15:07 lr: 0.000038 grad: 0.1495 (0.1862) loss: 0.7879 (0.7904) time: 0.2687 data: 0.0002 max mem: 26157 Train: [64] [3000/6250] eta: 0:14:40 lr: 0.000038 grad: 0.1533 (0.1861) loss: 0.7859 (0.7903) time: 0.2673 data: 0.0002 max mem: 26157 Train: [64] [3100/6250] eta: 0:14:12 lr: 0.000038 grad: 0.1581 (0.1860) loss: 0.7848 (0.7903) time: 0.2694 data: 0.0002 max mem: 26157 Train: [64] [3200/6250] eta: 0:13:45 lr: 0.000038 grad: 0.1524 (0.1862) loss: 0.7804 (0.7902) time: 0.2681 data: 0.0002 max mem: 26157 Train: [64] [3300/6250] eta: 0:13:18 lr: 0.000038 grad: 0.1479 (0.1868) loss: 0.7876 (0.7901) time: 0.2673 data: 0.0002 max mem: 26157 Train: [64] [3400/6250] eta: 0:12:50 lr: 0.000038 grad: 0.1481 (0.1876) loss: 0.7882 (0.7900) time: 0.2686 data: 0.0002 max mem: 26157 Train: [64] [3500/6250] eta: 0:12:23 lr: 0.000038 grad: 0.1524 (0.1877) loss: 0.7883 (0.7898) time: 0.2691 data: 0.0002 max mem: 26157 Train: [64] [3600/6250] eta: 0:11:56 lr: 0.000038 grad: 0.1509 (0.1882) loss: 0.7896 (0.7896) time: 0.2686 data: 0.0002 max mem: 26157 Train: [64] [3700/6250] eta: 0:11:29 lr: 0.000038 grad: 0.1544 (0.1890) loss: 0.7890 (0.7896) time: 0.2683 data: 0.0002 max mem: 26157 Train: [64] [3800/6250] eta: 0:11:02 lr: 0.000038 grad: 0.1572 (0.1891) loss: 0.7916 (0.7896) time: 0.2698 data: 0.0002 max mem: 26157 Train: [64] [3900/6250] eta: 0:10:35 lr: 0.000038 grad: 0.1538 (0.1898) loss: 0.7863 (0.7896) time: 0.2693 data: 0.0002 max mem: 26157 Train: [64] [4000/6250] eta: 0:10:08 lr: 0.000038 grad: 0.1528 (0.1899) loss: 0.7883 (0.7896) time: 0.2670 data: 0.0002 max mem: 26157 Train: [64] [4100/6250] eta: 0:09:40 lr: 0.000038 grad: 0.1556 (0.1909) loss: 0.7869 (0.7896) time: 0.2685 data: 0.0001 max mem: 26157 Train: [64] [4200/6250] eta: 0:09:13 lr: 0.000038 grad: 0.1547 (0.1914) loss: 0.7885 (0.7896) time: 0.2710 data: 0.0002 max mem: 26157 Train: [64] [4300/6250] eta: 0:08:46 lr: 0.000038 grad: 0.1724 (0.1916) loss: 0.7837 (0.7896) time: 0.2683 data: 0.0002 max mem: 26157 Train: [64] [4400/6250] eta: 0:08:19 lr: 0.000038 grad: 0.1735 (0.1921) loss: 0.7888 (0.7895) time: 0.2684 data: 0.0002 max mem: 26157 Train: [64] [4500/6250] eta: 0:07:52 lr: 0.000038 grad: 0.1567 (0.1924) loss: 0.7862 (0.7895) time: 0.2683 data: 0.0002 max mem: 26157 Train: [64] [4600/6250] eta: 0:07:25 lr: 0.000038 grad: 0.1578 (0.1921) loss: 0.7842 (0.7894) time: 0.2686 data: 0.0002 max mem: 26157 Train: [64] [4700/6250] eta: 0:06:58 lr: 0.000038 grad: 0.1806 (0.1929) loss: 0.7785 (0.7893) time: 0.2679 data: 0.0001 max mem: 26157 Train: [64] [4800/6250] eta: 0:06:31 lr: 0.000038 grad: 0.1700 (0.1936) loss: 0.7816 (0.7892) time: 0.2686 data: 0.0002 max mem: 26157 Train: [64] [4900/6250] eta: 0:06:04 lr: 0.000038 grad: 0.1572 (0.1934) loss: 0.7836 (0.7890) time: 0.2711 data: 0.0002 max mem: 26157 Train: [64] [5000/6250] eta: 0:05:37 lr: 0.000038 grad: 0.1546 (0.1934) loss: 0.7829 (0.7888) time: 0.2687 data: 0.0002 max mem: 26157 Train: [64] [5100/6250] eta: 0:05:10 lr: 0.000038 grad: 0.1667 (0.1937) loss: 0.7798 (0.7887) time: 0.2683 data: 0.0002 max mem: 26157 Train: [64] [5200/6250] eta: 0:04:43 lr: 0.000038 grad: 0.1474 (0.1936) loss: 0.7877 (0.7886) time: 0.2697 data: 0.0002 max mem: 26157 Train: [64] [5300/6250] eta: 0:04:16 lr: 0.000038 grad: 0.1766 (0.1938) loss: 0.7768 (0.7885) time: 0.2680 data: 0.0002 max mem: 26157 Train: [64] [5400/6250] eta: 0:03:49 lr: 0.000038 grad: 0.1672 (0.1937) loss: 0.7862 (0.7884) time: 0.2683 data: 0.0002 max mem: 26157 Train: [64] [5500/6250] eta: 0:03:22 lr: 0.000038 grad: 0.1532 (0.1939) loss: 0.7892 (0.7884) time: 0.2681 data: 0.0002 max mem: 26157 Train: [64] [5600/6250] eta: 0:02:55 lr: 0.000038 grad: 0.1681 (0.1937) loss: 0.7866 (0.7883) time: 0.2684 data: 0.0002 max mem: 26157 Train: [64] [5700/6250] eta: 0:02:28 lr: 0.000038 grad: 0.1629 (0.1935) loss: 0.7824 (0.7883) time: 0.2685 data: 0.0002 max mem: 26157 Train: [64] [5800/6250] eta: 0:02:01 lr: 0.000038 grad: 0.1668 (0.1940) loss: 0.7892 (0.7883) time: 0.2689 data: 0.0002 max mem: 26157 Train: [64] [5900/6250] eta: 0:01:34 lr: 0.000037 grad: 0.1619 (0.1941) loss: 0.7855 (0.7883) time: 0.2693 data: 0.0002 max mem: 26157 Train: [64] [6000/6250] eta: 0:01:07 lr: 0.000037 grad: 0.1836 (0.1948) loss: 0.7854 (0.7883) time: 0.2684 data: 0.0001 max mem: 26157 Train: [64] [6100/6250] eta: 0:00:40 lr: 0.000037 grad: 0.1607 (0.1954) loss: 0.7822 (0.7882) time: 0.2684 data: 0.0002 max mem: 26157 Train: [64] [6200/6250] eta: 0:00:13 lr: 0.000037 grad: 0.1789 (0.1961) loss: 0.7869 (0.7882) time: 0.2689 data: 0.0002 max mem: 26157 Train: [64] [6249/6250] eta: 0:00:00 lr: 0.000037 grad: 0.1501 (0.1961) loss: 0.7840 (0.7882) time: 0.2687 data: 0.0002 max mem: 26157 Train: [64] Total time: 0:28:11 (0.2706 s / it) Averaged stats: lr: 0.000037 grad: 0.1501 (0.1961) loss: 0.7840 (0.7882) Eval (hcp-train-subset): [64] [ 0/62] eta: 0:05:03 loss: 0.8213 (0.8213) time: 4.8912 data: 4.8076 max mem: 26157 Eval (hcp-train-subset): [64] [61/62] eta: 0:00:00 loss: 0.8102 (0.8127) time: 0.1176 data: 0.0346 max mem: 26157 Eval (hcp-train-subset): [64] Total time: 0:00:11 (0.1889 s / it) Averaged stats (hcp-train-subset): loss: 0.8102 (0.8127) Making plots (hcp-train-subset): example=10 Eval (hcp-val): [64] [ 0/62] eta: 0:03:20 loss: 0.8227 (0.8227) time: 3.2347 data: 3.1213 max mem: 26157 Eval (hcp-val): [64] [61/62] eta: 0:00:00 loss: 0.8242 (0.8254) time: 0.1170 data: 0.0346 max mem: 26157 Eval (hcp-val): [64] Total time: 0:00:11 (0.1872 s / it) Averaged stats (hcp-val): loss: 0.8242 (0.8254) Making plots (hcp-val): example=32 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [65] [ 0/6250] eta: 6:45:55 lr: 0.000037 grad: 0.1111 (0.1111) loss: 0.8515 (0.8515) time: 3.8969 data: 3.5636 max mem: 26157 Train: [65] [ 100/6250] eta: 0:32:29 lr: 0.000037 grad: 0.1761 (0.2166) loss: 0.8105 (0.8126) time: 0.2706 data: 0.0002 max mem: 26157 Train: [65] [ 200/6250] eta: 0:29:32 lr: 0.000037 grad: 0.1461 (0.2118) loss: 0.8143 (0.8101) time: 0.2680 data: 0.0002 max mem: 26157 Train: [65] [ 300/6250] eta: 0:28:14 lr: 0.000037 grad: 0.1629 (0.1991) loss: 0.7965 (0.8059) time: 0.2692 data: 0.0002 max mem: 26157 Train: [65] [ 400/6250] eta: 0:27:22 lr: 0.000037 grad: 0.1773 (0.1957) loss: 0.7875 (0.8019) time: 0.2693 data: 0.0002 max mem: 26157 Train: [65] [ 500/6250] eta: 0:26:40 lr: 0.000037 grad: 0.1557 (0.1940) loss: 0.7952 (0.7997) time: 0.2679 data: 0.0002 max mem: 26157 Train: [65] [ 600/6250] eta: 0:26:03 lr: 0.000037 grad: 0.1730 (0.1938) loss: 0.7884 (0.7977) time: 0.2689 data: 0.0001 max mem: 26157 Train: [65] [ 700/6250] eta: 0:25:29 lr: 0.000037 grad: 0.1602 (0.1941) loss: 0.7866 (0.7959) time: 0.2685 data: 0.0002 max mem: 26157 Train: [65] [ 800/6250] eta: 0:24:57 lr: 0.000037 grad: 0.1528 (0.1933) loss: 0.7862 (0.7949) time: 0.2684 data: 0.0002 max mem: 26157 Train: [65] [ 900/6250] eta: 0:24:26 lr: 0.000037 grad: 0.1575 (0.1907) loss: 0.7855 (0.7943) time: 0.2678 data: 0.0001 max mem: 26157 Train: [65] [1000/6250] eta: 0:23:56 lr: 0.000037 grad: 0.1489 (0.1905) loss: 0.7896 (0.7941) time: 0.2680 data: 0.0001 max mem: 26157 Train: [65] [1100/6250] eta: 0:23:26 lr: 0.000037 grad: 0.1719 (0.1909) loss: 0.7901 (0.7936) time: 0.2685 data: 0.0002 max mem: 26157 Train: [65] [1200/6250] eta: 0:22:57 lr: 0.000037 grad: 0.1577 (0.1922) loss: 0.7855 (0.7931) time: 0.2689 data: 0.0002 max mem: 26157 Train: [65] [1300/6250] eta: 0:22:28 lr: 0.000037 grad: 0.1627 (0.1951) loss: 0.7913 (0.7925) time: 0.2691 data: 0.0002 max mem: 26157 Train: [65] [1400/6250] eta: 0:22:00 lr: 0.000037 grad: 0.1571 (0.1967) loss: 0.7819 (0.7919) time: 0.2697 data: 0.0002 max mem: 26157 Train: [65] [1500/6250] eta: 0:21:31 lr: 0.000037 grad: 0.1562 (0.1962) loss: 0.7887 (0.7916) time: 0.2679 data: 0.0002 max mem: 26157 Train: [65] [1600/6250] eta: 0:21:03 lr: 0.000037 grad: 0.1556 (0.1958) loss: 0.7871 (0.7911) time: 0.2684 data: 0.0002 max mem: 26157 Train: [65] [1700/6250] eta: 0:20:35 lr: 0.000037 grad: 0.1636 (0.1957) loss: 0.7929 (0.7909) time: 0.2679 data: 0.0001 max mem: 26157 Train: [65] [1800/6250] eta: 0:20:07 lr: 0.000037 grad: 0.1795 (0.1966) loss: 0.7820 (0.7904) time: 0.2680 data: 0.0002 max mem: 26157 Train: [65] [1900/6250] eta: 0:19:39 lr: 0.000037 grad: 0.1530 (0.1982) loss: 0.7841 (0.7902) time: 0.2687 data: 0.0002 max mem: 26157 Train: [65] [2000/6250] eta: 0:19:11 lr: 0.000037 grad: 0.1564 (0.1984) loss: 0.7921 (0.7901) time: 0.2690 data: 0.0002 max mem: 26157 Train: [65] [2100/6250] eta: 0:18:44 lr: 0.000037 grad: 0.1580 (0.2021) loss: 0.7869 (0.7900) time: 0.2693 data: 0.0002 max mem: 26157 Train: [65] [2200/6250] eta: 0:18:16 lr: 0.000037 grad: 0.1694 (0.2045) loss: 0.7905 (0.7898) time: 0.2678 data: 0.0002 max mem: 26157 Train: [65] [2300/6250] eta: 0:17:49 lr: 0.000037 grad: 0.2151 (0.2088) loss: 0.7899 (0.7898) time: 0.2682 data: 0.0001 max mem: 26157 Train: [65] [2400/6250] eta: 0:17:21 lr: 0.000037 grad: 0.1739 (0.2093) loss: 0.7895 (0.7897) time: 0.2690 data: 0.0002 max mem: 26157 Train: [65] [2500/6250] eta: 0:16:54 lr: 0.000037 grad: 0.1672 (0.2087) loss: 0.7873 (0.7896) time: 0.2672 data: 0.0002 max mem: 26157 Train: [65] [2600/6250] eta: 0:16:26 lr: 0.000037 grad: 0.1686 (0.2086) loss: 0.7861 (0.7894) time: 0.2697 data: 0.0002 max mem: 26157 Train: [65] [2700/6250] eta: 0:15:59 lr: 0.000037 grad: 0.1584 (0.2077) loss: 0.7909 (0.7894) time: 0.2678 data: 0.0002 max mem: 26157 Train: [65] [2800/6250] eta: 0:15:32 lr: 0.000037 grad: 0.1714 (0.2071) loss: 0.7843 (0.7892) time: 0.2682 data: 0.0002 max mem: 26157 Train: [65] [2900/6250] eta: 0:15:05 lr: 0.000037 grad: 0.1604 (0.2079) loss: 0.7836 (0.7891) time: 0.2680 data: 0.0001 max mem: 26157 Train: [65] [3000/6250] eta: 0:14:37 lr: 0.000036 grad: 0.1744 (0.2088) loss: 0.7907 (0.7890) time: 0.2679 data: 0.0002 max mem: 26157 Train: [65] [3100/6250] eta: 0:14:10 lr: 0.000036 grad: 0.1608 (0.2086) loss: 0.7885 (0.7888) time: 0.2674 data: 0.0002 max mem: 26157 Train: [65] [3200/6250] eta: 0:13:43 lr: 0.000036 grad: 0.1661 (0.2087) loss: 0.7847 (0.7887) time: 0.2694 data: 0.0002 max mem: 26157 Train: [65] [3300/6250] eta: 0:13:16 lr: 0.000036 grad: 0.1650 (0.2089) loss: 0.7881 (0.7884) time: 0.2684 data: 0.0002 max mem: 26157 Train: [65] [3400/6250] eta: 0:12:49 lr: 0.000036 grad: 0.1536 (0.2089) loss: 0.7869 (0.7883) time: 0.2709 data: 0.0002 max mem: 26157 Train: [65] [3500/6250] eta: 0:12:22 lr: 0.000036 grad: 0.1513 (0.2109) loss: 0.7945 (0.7882) time: 0.2707 data: 0.0002 max mem: 26157 Train: [65] [3600/6250] eta: 0:11:55 lr: 0.000036 grad: 0.1769 (0.2109) loss: 0.7809 (0.7881) time: 0.2675 data: 0.0002 max mem: 26157 Train: [65] [3700/6250] eta: 0:11:28 lr: 0.000036 grad: 0.1708 (0.2116) loss: 0.7823 (0.7881) time: 0.2686 data: 0.0002 max mem: 26157 Train: [65] [3800/6250] eta: 0:11:00 lr: 0.000036 grad: 0.1604 (0.2120) loss: 0.7859 (0.7880) time: 0.2686 data: 0.0002 max mem: 26157 Train: [65] [3900/6250] eta: 0:10:33 lr: 0.000036 grad: 0.1672 (0.2128) loss: 0.7810 (0.7879) time: 0.2681 data: 0.0002 max mem: 26157 Train: [65] [4000/6250] eta: 0:10:06 lr: 0.000036 grad: 0.1688 (0.2137) loss: 0.7801 (0.7877) time: 0.2683 data: 0.0002 max mem: 26157 Train: [65] [4100/6250] eta: 0:09:39 lr: 0.000036 grad: 0.1626 (0.2129) loss: 0.7828 (0.7877) time: 0.2696 data: 0.0002 max mem: 26157 Train: [65] [4200/6250] eta: 0:09:12 lr: 0.000036 grad: 0.1560 (0.2130) loss: 0.7807 (0.7876) time: 0.2682 data: 0.0001 max mem: 26157 Train: [65] [4300/6250] eta: 0:08:45 lr: 0.000036 grad: 0.1655 (0.2126) loss: 0.7784 (0.7876) time: 0.2689 data: 0.0002 max mem: 26157 Train: [65] [4400/6250] eta: 0:08:18 lr: 0.000036 grad: 0.1564 (0.2125) loss: 0.7995 (0.7876) time: 0.2700 data: 0.0002 max mem: 26157 Train: [65] [4500/6250] eta: 0:07:51 lr: 0.000036 grad: 0.1716 (0.2131) loss: 0.7861 (0.7876) time: 0.2683 data: 0.0002 max mem: 26157 Train: [65] [4600/6250] eta: 0:07:24 lr: 0.000036 grad: 0.1585 (0.2131) loss: 0.7836 (0.7876) time: 0.2688 data: 0.0002 max mem: 26157 Train: [65] [4700/6250] eta: 0:06:57 lr: 0.000036 grad: 0.1635 (0.2128) loss: 0.7901 (0.7876) time: 0.2691 data: 0.0002 max mem: 26157 Train: [65] [4800/6250] eta: 0:06:30 lr: 0.000036 grad: 0.1681 (0.2135) loss: 0.7929 (0.7876) time: 0.2692 data: 0.0002 max mem: 26157 Train: [65] [4900/6250] eta: 0:06:03 lr: 0.000036 grad: 0.1689 (0.2135) loss: 0.7863 (0.7876) time: 0.2678 data: 0.0002 max mem: 26157 Train: [65] [5000/6250] eta: 0:05:36 lr: 0.000036 grad: 0.1671 (0.2139) loss: 0.7786 (0.7876) time: 0.2683 data: 0.0002 max mem: 26157 Train: [65] [5100/6250] eta: 0:05:09 lr: 0.000036 grad: 0.1577 (0.2152) loss: 0.7832 (0.7875) time: 0.2687 data: 0.0002 max mem: 26157 Train: [65] [5200/6250] eta: 0:04:42 lr: 0.000036 grad: 0.1709 (0.2159) loss: 0.7779 (0.7874) time: 0.2682 data: 0.0002 max mem: 26157 Train: [65] [5300/6250] eta: 0:04:16 lr: 0.000036 grad: 0.1706 (0.2163) loss: 0.7848 (0.7873) time: 0.2690 data: 0.0002 max mem: 26157 Train: [65] [5400/6250] eta: 0:03:49 lr: 0.000036 grad: 0.1815 (0.2167) loss: 0.7854 (0.7873) time: 0.2710 data: 0.0002 max mem: 26157 Train: [65] [5500/6250] eta: 0:03:22 lr: 0.000036 grad: 0.1688 (0.2180) loss: 0.7929 (0.7873) time: 0.2705 data: 0.0002 max mem: 26157 Train: [65] [5600/6250] eta: 0:02:55 lr: 0.000036 grad: 0.1743 (0.2183) loss: 0.7950 (0.7873) time: 0.2676 data: 0.0002 max mem: 26157 Train: [65] [5700/6250] eta: 0:02:28 lr: 0.000036 grad: 0.1525 (0.2191) loss: 0.7912 (0.7873) time: 0.2693 data: 0.0002 max mem: 26157 Train: [65] [5800/6250] eta: 0:02:01 lr: 0.000036 grad: 0.1844 (0.2192) loss: 0.7857 (0.7874) time: 0.2676 data: 0.0002 max mem: 26157 Train: [65] [5900/6250] eta: 0:01:34 lr: 0.000036 grad: 0.1708 (0.2200) loss: 0.7924 (0.7874) time: 0.2683 data: 0.0002 max mem: 26157 Train: [65] [6000/6250] eta: 0:01:07 lr: 0.000036 grad: 0.1657 (0.2198) loss: 0.7913 (0.7874) time: 0.2695 data: 0.0002 max mem: 26157 Train: [65] [6100/6250] eta: 0:00:40 lr: 0.000036 grad: 0.1661 (0.2203) loss: 0.7865 (0.7874) time: 0.2677 data: 0.0002 max mem: 26157 Train: [65] [6200/6250] eta: 0:00:13 lr: 0.000036 grad: 0.1640 (0.2200) loss: 0.7829 (0.7874) time: 0.2688 data: 0.0001 max mem: 26157 Train: [65] [6249/6250] eta: 0:00:00 lr: 0.000036 grad: 0.1959 (0.2200) loss: 0.7849 (0.7874) time: 0.2689 data: 0.0002 max mem: 26157 Train: [65] Total time: 0:28:09 (0.2703 s / it) Averaged stats: lr: 0.000036 grad: 0.1959 (0.2200) loss: 0.7849 (0.7874) Eval (hcp-train-subset): [65] [ 0/62] eta: 0:05:29 loss: 0.8203 (0.8203) time: 5.3127 data: 5.2298 max mem: 26157 Eval (hcp-train-subset): [65] [61/62] eta: 0:00:00 loss: 0.8065 (0.8102) time: 0.1312 data: 0.0468 max mem: 26157 Eval (hcp-train-subset): [65] Total time: 0:00:12 (0.1983 s / it) Averaged stats (hcp-train-subset): loss: 0.8065 (0.8102) Making plots (hcp-train-subset): example=47 Eval (hcp-val): [65] [ 0/62] eta: 0:05:39 loss: 0.8201 (0.8201) time: 5.4812 data: 5.3974 max mem: 26157 Eval (hcp-val): [65] [61/62] eta: 0:00:00 loss: 0.8236 (0.8248) time: 0.1093 data: 0.0244 max mem: 26157 Eval (hcp-val): [65] Total time: 0:00:12 (0.1975 s / it) Averaged stats (hcp-val): loss: 0.8236 (0.8248) Making plots (hcp-val): example=59 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [66] [ 0/6250] eta: 9:36:42 lr: 0.000036 grad: 0.2856 (0.2856) loss: 0.8051 (0.8051) time: 5.5364 data: 5.2576 max mem: 26157 Train: [66] [ 100/6250] eta: 0:33:29 lr: 0.000035 grad: 0.1727 (0.2341) loss: 0.8104 (0.8120) time: 0.2680 data: 0.0002 max mem: 26157 Train: [66] [ 200/6250] eta: 0:30:00 lr: 0.000035 grad: 0.1688 (0.2296) loss: 0.7939 (0.8038) time: 0.2674 data: 0.0002 max mem: 26157 Train: [66] [ 300/6250] eta: 0:28:32 lr: 0.000035 grad: 0.1729 (0.2353) loss: 0.7840 (0.7984) time: 0.2684 data: 0.0002 max mem: 26157 Train: [66] [ 400/6250] eta: 0:27:35 lr: 0.000035 grad: 0.1723 (0.2393) loss: 0.7918 (0.7958) time: 0.2682 data: 0.0002 max mem: 26157 Train: [66] [ 500/6250] eta: 0:26:50 lr: 0.000035 grad: 0.1952 (0.2395) loss: 0.7890 (0.7930) time: 0.2677 data: 0.0002 max mem: 26157 Train: [66] [ 600/6250] eta: 0:26:10 lr: 0.000035 grad: 0.1785 (0.2420) loss: 0.7874 (0.7915) time: 0.2682 data: 0.0002 max mem: 26157 Train: [66] [ 700/6250] eta: 0:25:36 lr: 0.000035 grad: 0.1939 (0.2380) loss: 0.7836 (0.7904) time: 0.2698 data: 0.0002 max mem: 26157 Train: [66] [ 800/6250] eta: 0:25:03 lr: 0.000035 grad: 0.1752 (0.2376) loss: 0.7909 (0.7894) time: 0.2720 data: 0.0002 max mem: 26157 Train: [66] [ 900/6250] eta: 0:24:31 lr: 0.000035 grad: 0.1581 (0.2340) loss: 0.7886 (0.7887) time: 0.2696 data: 0.0002 max mem: 26157 Train: [66] [1000/6250] eta: 0:24:00 lr: 0.000035 grad: 0.1703 (0.2358) loss: 0.7769 (0.7886) time: 0.2689 data: 0.0002 max mem: 26157 Train: [66] [1100/6250] eta: 0:23:31 lr: 0.000035 grad: 0.1752 (0.2355) loss: 0.7869 (0.7885) time: 0.2701 data: 0.0002 max mem: 26157 Train: [66] [1200/6250] eta: 0:23:01 lr: 0.000035 grad: 0.1593 (0.2328) loss: 0.7756 (0.7881) time: 0.2696 data: 0.0002 max mem: 26157 Train: [66] [1300/6250] eta: 0:22:32 lr: 0.000035 grad: 0.1638 (0.2295) loss: 0.7731 (0.7878) time: 0.2676 data: 0.0002 max mem: 26157 Train: [66] [1400/6250] eta: 0:22:03 lr: 0.000035 grad: 0.1691 (0.2263) loss: 0.7838 (0.7876) time: 0.2695 data: 0.0002 max mem: 26157 Train: [66] [1500/6250] eta: 0:21:35 lr: 0.000035 grad: 0.1687 (0.2250) loss: 0.7803 (0.7873) time: 0.2713 data: 0.0002 max mem: 26157 Train: [66] [1600/6250] eta: 0:21:07 lr: 0.000035 grad: 0.1532 (0.2242) loss: 0.7825 (0.7871) time: 0.2680 data: 0.0002 max mem: 26157 Train: [66] [1700/6250] eta: 0:20:38 lr: 0.000035 grad: 0.1681 (0.2236) loss: 0.7741 (0.7869) time: 0.2691 data: 0.0002 max mem: 26157 Train: [66] [1800/6250] eta: 0:20:10 lr: 0.000035 grad: 0.1589 (0.2241) loss: 0.7878 (0.7868) time: 0.2679 data: 0.0002 max mem: 26157 Train: [66] [1900/6250] eta: 0:19:42 lr: 0.000035 grad: 0.1671 (0.2253) loss: 0.7913 (0.7867) time: 0.2707 data: 0.0002 max mem: 26157 Train: [66] [2000/6250] eta: 0:19:15 lr: 0.000035 grad: 0.1535 (0.2249) loss: 0.7843 (0.7865) time: 0.2709 data: 0.0002 max mem: 26157 Train: [66] [2100/6250] eta: 0:18:47 lr: 0.000035 grad: 0.1702 (0.2286) loss: 0.7836 (0.7863) time: 0.2694 data: 0.0002 max mem: 26157 Train: [66] [2200/6250] eta: 0:18:19 lr: 0.000035 grad: 0.1639 (0.2281) loss: 0.7908 (0.7863) time: 0.2685 data: 0.0002 max mem: 26157 Train: [66] [2300/6250] eta: 0:17:52 lr: 0.000035 grad: 0.1734 (0.2275) loss: 0.7807 (0.7862) time: 0.2688 data: 0.0002 max mem: 26157 Train: [66] [2400/6250] eta: 0:17:24 lr: 0.000035 grad: 0.1661 (0.2285) loss: 0.7831 (0.7859) time: 0.2683 data: 0.0001 max mem: 26157 Train: [66] [2500/6250] eta: 0:16:57 lr: 0.000035 grad: 0.1579 (0.2287) loss: 0.7892 (0.7858) time: 0.2695 data: 0.0002 max mem: 26157 Train: [66] [2600/6250] eta: 0:16:29 lr: 0.000035 grad: 0.1569 (0.2276) loss: 0.7823 (0.7858) time: 0.2702 data: 0.0002 max mem: 26157 Train: [66] [2700/6250] eta: 0:16:02 lr: 0.000035 grad: 0.1578 (0.2281) loss: 0.7866 (0.7857) time: 0.2696 data: 0.0002 max mem: 26157 Train: [66] [2800/6250] eta: 0:15:35 lr: 0.000035 grad: 0.1554 (0.2280) loss: 0.7886 (0.7857) time: 0.2681 data: 0.0002 max mem: 26157 Train: [66] [2900/6250] eta: 0:15:07 lr: 0.000035 grad: 0.1573 (0.2266) loss: 0.7829 (0.7857) time: 0.2682 data: 0.0001 max mem: 26157 Train: [66] [3000/6250] eta: 0:14:40 lr: 0.000035 grad: 0.1620 (0.2258) loss: 0.7825 (0.7857) time: 0.2685 data: 0.0002 max mem: 26157 Train: [66] [3100/6250] eta: 0:14:12 lr: 0.000035 grad: 0.1562 (0.2250) loss: 0.7914 (0.7857) time: 0.2677 data: 0.0002 max mem: 26157 Train: [66] [3200/6250] eta: 0:13:45 lr: 0.000035 grad: 0.1597 (0.2284) loss: 0.7876 (0.7858) time: 0.2667 data: 0.0002 max mem: 26157 Train: [66] [3300/6250] eta: 0:13:18 lr: 0.000035 grad: 0.1543 (0.2277) loss: 0.7825 (0.7859) time: 0.2686 data: 0.0002 max mem: 26157 Train: [66] [3400/6250] eta: 0:12:51 lr: 0.000035 grad: 0.1573 (0.2292) loss: 0.7998 (0.7860) time: 0.2680 data: 0.0002 max mem: 26157 Train: [66] [3500/6250] eta: 0:12:23 lr: 0.000034 grad: 0.1525 (0.2287) loss: 0.7978 (0.7861) time: 0.2684 data: 0.0001 max mem: 26157 Train: [66] [3600/6250] eta: 0:11:56 lr: 0.000034 grad: 0.1627 (0.2282) loss: 0.7868 (0.7862) time: 0.2680 data: 0.0001 max mem: 26157 Train: [66] [3700/6250] eta: 0:11:29 lr: 0.000034 grad: 0.1592 (0.2274) loss: 0.7896 (0.7863) time: 0.2683 data: 0.0002 max mem: 26157 Train: [66] [3800/6250] eta: 0:11:02 lr: 0.000034 grad: 0.1688 (0.2271) loss: 0.7896 (0.7863) time: 0.2694 data: 0.0002 max mem: 26157 Train: [66] [3900/6250] eta: 0:10:35 lr: 0.000034 grad: 0.1765 (0.2278) loss: 0.7806 (0.7864) time: 0.2672 data: 0.0002 max mem: 26157 Train: [66] [4000/6250] eta: 0:10:08 lr: 0.000034 grad: 0.1622 (0.2292) loss: 0.7844 (0.7864) time: 0.2670 data: 0.0002 max mem: 26157 Train: [66] [4100/6250] eta: 0:09:40 lr: 0.000034 grad: 0.1632 (0.2294) loss: 0.7916 (0.7865) time: 0.2679 data: 0.0002 max mem: 26157 Train: [66] [4200/6250] eta: 0:09:13 lr: 0.000034 grad: 0.1629 (0.2292) loss: 0.7943 (0.7866) time: 0.2677 data: 0.0002 max mem: 26157 Train: [66] [4300/6250] eta: 0:08:46 lr: 0.000034 grad: 0.1552 (0.2298) loss: 0.7870 (0.7867) time: 0.2688 data: 0.0002 max mem: 26157 Train: [66] [4400/6250] eta: 0:08:19 lr: 0.000034 grad: 0.1576 (0.2297) loss: 0.7883 (0.7868) time: 0.2679 data: 0.0002 max mem: 26157 Train: [66] [4500/6250] eta: 0:07:52 lr: 0.000034 grad: 0.1636 (0.2296) loss: 0.7945 (0.7869) time: 0.2686 data: 0.0002 max mem: 26157 Train: [66] [4600/6250] eta: 0:07:25 lr: 0.000034 grad: 0.1742 (0.2289) loss: 0.7799 (0.7870) time: 0.2680 data: 0.0002 max mem: 26157 Train: [66] [4700/6250] eta: 0:06:58 lr: 0.000034 grad: 0.1731 (0.2283) loss: 0.7831 (0.7870) time: 0.2681 data: 0.0002 max mem: 26157 Train: [66] [4800/6250] eta: 0:06:31 lr: 0.000034 grad: 0.1632 (0.2293) loss: 0.7907 (0.7870) time: 0.2678 data: 0.0002 max mem: 26157 Train: [66] [4900/6250] eta: 0:06:04 lr: 0.000034 grad: 0.1566 (0.2289) loss: 0.7882 (0.7870) time: 0.2709 data: 0.0002 max mem: 26157 Train: [66] [5000/6250] eta: 0:05:37 lr: 0.000034 grad: 0.1770 (0.2296) loss: 0.7814 (0.7869) time: 0.2686 data: 0.0002 max mem: 26157 Train: [66] [5100/6250] eta: 0:05:10 lr: 0.000034 grad: 0.1640 (0.2299) loss: 0.7832 (0.7869) time: 0.2691 data: 0.0002 max mem: 26157 Train: [66] [5200/6250] eta: 0:04:43 lr: 0.000034 grad: 0.1661 (0.2295) loss: 0.7788 (0.7868) time: 0.2688 data: 0.0002 max mem: 26157 Train: [66] [5300/6250] eta: 0:04:16 lr: 0.000034 grad: 0.1621 (0.2292) loss: 0.7884 (0.7868) time: 0.2669 data: 0.0001 max mem: 26157 Train: [66] [5400/6250] eta: 0:03:49 lr: 0.000034 grad: 0.1677 (0.2292) loss: 0.7830 (0.7868) time: 0.2685 data: 0.0002 max mem: 26157 Train: [66] [5500/6250] eta: 0:03:22 lr: 0.000034 grad: 0.1686 (0.2295) loss: 0.7864 (0.7867) time: 0.2686 data: 0.0002 max mem: 26157 Train: [66] [5600/6250] eta: 0:02:55 lr: 0.000034 grad: 0.1704 (0.2297) loss: 0.7834 (0.7867) time: 0.2688 data: 0.0002 max mem: 26157 Train: [66] [5700/6250] eta: 0:02:28 lr: 0.000034 grad: 0.1652 (0.2291) loss: 0.7756 (0.7866) time: 0.2685 data: 0.0002 max mem: 26157 Train: [66] [5800/6250] eta: 0:02:01 lr: 0.000034 grad: 0.1783 (0.2292) loss: 0.7801 (0.7866) time: 0.2686 data: 0.0002 max mem: 26157 Train: [66] [5900/6250] eta: 0:01:34 lr: 0.000034 grad: 0.1550 (0.2292) loss: 0.7762 (0.7865) time: 0.2679 data: 0.0002 max mem: 26157 Train: [66] [6000/6250] eta: 0:01:07 lr: 0.000034 grad: 0.1598 (0.2289) loss: 0.7836 (0.7865) time: 0.2682 data: 0.0002 max mem: 26157 Train: [66] [6100/6250] eta: 0:00:40 lr: 0.000034 grad: 0.1657 (0.2289) loss: 0.7799 (0.7864) time: 0.2687 data: 0.0002 max mem: 26157 Train: [66] [6200/6250] eta: 0:00:13 lr: 0.000034 grad: 0.1673 (0.2288) loss: 0.7711 (0.7863) time: 0.2684 data: 0.0002 max mem: 26157 Train: [66] [6249/6250] eta: 0:00:00 lr: 0.000034 grad: 0.1642 (0.2286) loss: 0.7913 (0.7863) time: 0.2679 data: 0.0002 max mem: 26157 Train: [66] Total time: 0:28:10 (0.2705 s / it) Averaged stats: lr: 0.000034 grad: 0.1642 (0.2286) loss: 0.7913 (0.7863) Eval (hcp-train-subset): [66] [ 0/62] eta: 0:03:20 loss: 0.8197 (0.8197) time: 3.2305 data: 3.1253 max mem: 26157 Eval (hcp-train-subset): [66] [61/62] eta: 0:00:00 loss: 0.8101 (0.8119) time: 0.1094 data: 0.0266 max mem: 26157 Eval (hcp-train-subset): [66] Total time: 0:00:12 (0.1944 s / it) Averaged stats (hcp-train-subset): loss: 0.8101 (0.8119) Making plots (hcp-train-subset): example=16 Eval (hcp-val): [66] [ 0/62] eta: 0:03:42 loss: 0.8205 (0.8205) time: 3.5948 data: 3.5027 max mem: 26157 Eval (hcp-val): [66] [61/62] eta: 0:00:00 loss: 0.8243 (0.8254) time: 0.1193 data: 0.0365 max mem: 26157 Eval (hcp-val): [66] Total time: 0:00:12 (0.1984 s / it) Averaged stats (hcp-val): loss: 0.8243 (0.8254) Making plots (hcp-val): example=16 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [67] [ 0/6250] eta: 7:48:41 lr: 0.000034 grad: 0.3304 (0.3304) loss: 0.7778 (0.7778) time: 4.4995 data: 4.1973 max mem: 26157 Train: [67] [ 100/6250] eta: 0:32:33 lr: 0.000034 grad: 0.1899 (0.2479) loss: 0.8115 (0.8145) time: 0.2699 data: 0.0002 max mem: 26157 Train: [67] [ 200/6250] eta: 0:29:40 lr: 0.000034 grad: 0.1862 (0.2402) loss: 0.7933 (0.8051) time: 0.2700 data: 0.0002 max mem: 26157 Train: [67] [ 300/6250] eta: 0:28:19 lr: 0.000034 grad: 0.1761 (0.2278) loss: 0.7849 (0.7984) time: 0.2679 data: 0.0002 max mem: 26157 Train: [67] [ 400/6250] eta: 0:27:25 lr: 0.000034 grad: 0.1711 (0.2243) loss: 0.7781 (0.7957) time: 0.2684 data: 0.0002 max mem: 26157 Train: [67] [ 500/6250] eta: 0:26:42 lr: 0.000034 grad: 0.1690 (0.2281) loss: 0.7865 (0.7937) time: 0.2674 data: 0.0001 max mem: 26157 Train: [67] [ 600/6250] eta: 0:26:04 lr: 0.000033 grad: 0.2022 (0.2327) loss: 0.7793 (0.7916) time: 0.2682 data: 0.0001 max mem: 26157 Train: [67] [ 700/6250] eta: 0:25:29 lr: 0.000033 grad: 0.1653 (0.2304) loss: 0.7847 (0.7902) time: 0.2672 data: 0.0002 max mem: 26157 Train: [67] [ 800/6250] eta: 0:24:57 lr: 0.000033 grad: 0.1677 (0.2277) loss: 0.7923 (0.7897) time: 0.2686 data: 0.0002 max mem: 26157 Train: [67] [ 900/6250] eta: 0:24:26 lr: 0.000033 grad: 0.1839 (0.2293) loss: 0.7862 (0.7896) time: 0.2704 data: 0.0002 max mem: 26157 Train: [67] [1000/6250] eta: 0:23:57 lr: 0.000033 grad: 0.1570 (0.2262) loss: 0.7884 (0.7896) time: 0.2701 data: 0.0002 max mem: 26157 Train: [67] [1100/6250] eta: 0:23:27 lr: 0.000033 grad: 0.1666 (0.2305) loss: 0.7832 (0.7892) time: 0.2673 data: 0.0002 max mem: 26157 Train: [67] [1200/6250] eta: 0:22:57 lr: 0.000033 grad: 0.1712 (0.2306) loss: 0.7838 (0.7890) time: 0.2682 data: 0.0003 max mem: 26157 Train: [67] [1300/6250] eta: 0:22:28 lr: 0.000033 grad: 0.1670 (0.2279) loss: 0.7816 (0.7886) time: 0.2682 data: 0.0002 max mem: 26157 Train: [67] [1400/6250] eta: 0:21:59 lr: 0.000033 grad: 0.1669 (0.2290) loss: 0.7774 (0.7882) time: 0.2682 data: 0.0002 max mem: 26157 Train: [67] [1500/6250] eta: 0:21:31 lr: 0.000033 grad: 0.1622 (0.2268) loss: 0.7865 (0.7881) time: 0.2684 data: 0.0002 max mem: 26157 Train: [67] [1600/6250] eta: 0:21:03 lr: 0.000033 grad: 0.1606 (0.2273) loss: 0.7890 (0.7878) time: 0.2683 data: 0.0002 max mem: 26157 Train: [67] [1700/6250] eta: 0:20:35 lr: 0.000033 grad: 0.1690 (0.2250) loss: 0.7821 (0.7875) time: 0.2673 data: 0.0002 max mem: 26157 Train: [67] [1800/6250] eta: 0:20:07 lr: 0.000033 grad: 0.1572 (0.2244) loss: 0.7841 (0.7875) time: 0.2678 data: 0.0002 max mem: 26157 Train: [67] [1900/6250] eta: 0:19:39 lr: 0.000033 grad: 0.1676 (0.2228) loss: 0.7830 (0.7875) time: 0.2689 data: 0.0001 max mem: 26157 Train: [67] [2000/6250] eta: 0:19:11 lr: 0.000033 grad: 0.1715 (0.2215) loss: 0.7861 (0.7874) time: 0.2692 data: 0.0002 max mem: 26157 Train: [67] [2100/6250] eta: 0:18:44 lr: 0.000033 grad: 0.1538 (0.2220) loss: 0.7934 (0.7875) time: 0.2678 data: 0.0001 max mem: 26157 Train: [67] [2200/6250] eta: 0:18:16 lr: 0.000033 grad: 0.1879 (0.2227) loss: 0.7861 (0.7874) time: 0.2678 data: 0.0002 max mem: 26157 Train: [67] [2300/6250] eta: 0:17:48 lr: 0.000033 grad: 0.1889 (0.2235) loss: 0.7817 (0.7873) time: 0.2678 data: 0.0001 max mem: 26157 Train: [67] [2400/6250] eta: 0:17:21 lr: 0.000033 grad: 0.1572 (0.2250) loss: 0.7889 (0.7874) time: 0.2688 data: 0.0002 max mem: 26157 Train: [67] [2500/6250] eta: 0:16:54 lr: 0.000033 grad: 0.1601 (0.2258) loss: 0.7871 (0.7874) time: 0.2692 data: 0.0002 max mem: 26157 Train: [67] [2600/6250] eta: 0:16:27 lr: 0.000033 grad: 0.1573 (0.2265) loss: 0.7840 (0.7874) time: 0.2697 data: 0.0002 max mem: 26157 Train: [67] [2700/6250] eta: 0:15:59 lr: 0.000033 grad: 0.1654 (0.2281) loss: 0.7890 (0.7874) time: 0.2681 data: 0.0002 max mem: 26157 Train: [67] [2800/6250] eta: 0:15:32 lr: 0.000033 grad: 0.1563 (0.2286) loss: 0.7833 (0.7872) time: 0.2675 data: 0.0002 max mem: 26157 Train: [67] [2900/6250] eta: 0:15:05 lr: 0.000033 grad: 0.1863 (0.2296) loss: 0.7826 (0.7871) time: 0.2688 data: 0.0002 max mem: 26157 Train: [67] [3000/6250] eta: 0:14:38 lr: 0.000033 grad: 0.1796 (0.2296) loss: 0.7882 (0.7870) time: 0.2689 data: 0.0002 max mem: 26157 Train: [67] [3100/6250] eta: 0:14:10 lr: 0.000033 grad: 0.1608 (0.2286) loss: 0.7874 (0.7869) time: 0.2676 data: 0.0002 max mem: 26157 Train: [67] [3200/6250] eta: 0:13:43 lr: 0.000033 grad: 0.1631 (0.2284) loss: 0.7828 (0.7868) time: 0.2682 data: 0.0002 max mem: 26157 Train: [67] [3300/6250] eta: 0:13:16 lr: 0.000033 grad: 0.1709 (0.2283) loss: 0.7714 (0.7866) time: 0.2693 data: 0.0002 max mem: 26157 Train: [67] [3400/6250] eta: 0:12:49 lr: 0.000033 grad: 0.1856 (0.2291) loss: 0.7747 (0.7864) time: 0.2691 data: 0.0002 max mem: 26157 Train: [67] [3500/6250] eta: 0:12:22 lr: 0.000033 grad: 0.1843 (0.2285) loss: 0.7792 (0.7863) time: 0.2671 data: 0.0002 max mem: 26157 Train: [67] [3600/6250] eta: 0:11:54 lr: 0.000033 grad: 0.1708 (0.2286) loss: 0.7870 (0.7862) time: 0.2692 data: 0.0001 max mem: 26157 Train: [67] [3700/6250] eta: 0:11:27 lr: 0.000033 grad: 0.1903 (0.2315) loss: 0.7772 (0.7859) time: 0.2705 data: 0.0002 max mem: 26157 Train: [67] [3800/6250] eta: 0:11:00 lr: 0.000033 grad: 0.1855 (0.2325) loss: 0.7809 (0.7858) time: 0.2689 data: 0.0002 max mem: 26157 Train: [67] [3900/6250] eta: 0:10:33 lr: 0.000033 grad: 0.1733 (0.2325) loss: 0.7750 (0.7857) time: 0.2697 data: 0.0002 max mem: 26157 Train: [67] [4000/6250] eta: 0:10:07 lr: 0.000032 grad: 0.1809 (0.2330) loss: 0.7749 (0.7856) time: 0.2684 data: 0.0001 max mem: 26157 Train: [67] [4100/6250] eta: 0:09:40 lr: 0.000032 grad: 0.1727 (0.2344) loss: 0.7816 (0.7855) time: 0.2685 data: 0.0001 max mem: 26157 Train: [67] [4200/6250] eta: 0:09:13 lr: 0.000032 grad: 0.1649 (0.2336) loss: 0.7707 (0.7853) time: 0.2679 data: 0.0001 max mem: 26157 Train: [67] [4300/6250] eta: 0:08:46 lr: 0.000032 grad: 0.1947 (0.2352) loss: 0.7801 (0.7852) time: 0.2687 data: 0.0001 max mem: 26157 Train: [67] [4400/6250] eta: 0:08:19 lr: 0.000032 grad: 0.1852 (0.2358) loss: 0.7900 (0.7851) time: 0.2702 data: 0.0002 max mem: 26157 Train: [67] [4500/6250] eta: 0:07:52 lr: 0.000032 grad: 0.1790 (0.2367) loss: 0.7780 (0.7850) time: 0.2700 data: 0.0002 max mem: 26157 Train: [67] [4600/6250] eta: 0:07:25 lr: 0.000032 grad: 0.1746 (0.2367) loss: 0.7818 (0.7849) time: 0.2696 data: 0.0002 max mem: 26157 Train: [67] [4700/6250] eta: 0:06:58 lr: 0.000032 grad: 0.1691 (0.2367) loss: 0.7804 (0.7848) time: 0.2685 data: 0.0002 max mem: 26157 Train: [67] [4800/6250] eta: 0:06:31 lr: 0.000032 grad: 0.1679 (0.2367) loss: 0.7702 (0.7847) time: 0.2682 data: 0.0002 max mem: 26157 Train: [67] [4900/6250] eta: 0:06:04 lr: 0.000032 grad: 0.1596 (0.2365) loss: 0.7810 (0.7845) time: 0.2705 data: 0.0002 max mem: 26157 Train: [67] [5000/6250] eta: 0:05:37 lr: 0.000032 grad: 0.1760 (0.2360) loss: 0.7715 (0.7844) time: 0.2701 data: 0.0002 max mem: 26157 Train: [67] [5100/6250] eta: 0:05:10 lr: 0.000032 grad: 0.2012 (0.2364) loss: 0.7669 (0.7842) time: 0.2683 data: 0.0002 max mem: 26157 Train: [67] [5200/6250] eta: 0:04:43 lr: 0.000032 grad: 0.1872 (0.2367) loss: 0.7775 (0.7840) time: 0.2691 data: 0.0002 max mem: 26157 Train: [67] [5300/6250] eta: 0:04:16 lr: 0.000032 grad: 0.1687 (0.2383) loss: 0.7801 (0.7839) time: 0.2686 data: 0.0002 max mem: 26157 Train: [67] [5400/6250] eta: 0:03:49 lr: 0.000032 grad: 0.1844 (0.2391) loss: 0.7750 (0.7838) time: 0.2705 data: 0.0002 max mem: 26157 Train: [67] [5500/6250] eta: 0:03:22 lr: 0.000032 grad: 0.1780 (0.2390) loss: 0.7810 (0.7837) time: 0.2692 data: 0.0001 max mem: 26157 Train: [67] [5600/6250] eta: 0:02:55 lr: 0.000032 grad: 0.1748 (0.2387) loss: 0.7815 (0.7836) time: 0.2681 data: 0.0002 max mem: 26157 Train: [67] [5700/6250] eta: 0:02:28 lr: 0.000032 grad: 0.1701 (0.2384) loss: 0.7750 (0.7836) time: 0.2701 data: 0.0002 max mem: 26157 Train: [67] [5800/6250] eta: 0:02:01 lr: 0.000032 grad: 0.1779 (0.2384) loss: 0.7802 (0.7835) time: 0.2688 data: 0.0002 max mem: 26157 Train: [67] [5900/6250] eta: 0:01:34 lr: 0.000032 grad: 0.1780 (0.2401) loss: 0.7750 (0.7834) time: 0.2687 data: 0.0002 max mem: 26157 Train: [67] [6000/6250] eta: 0:01:07 lr: 0.000032 grad: 0.1647 (0.2415) loss: 0.7880 (0.7834) time: 0.2686 data: 0.0002 max mem: 26157 Train: [67] [6100/6250] eta: 0:00:40 lr: 0.000032 grad: 0.1843 (0.2417) loss: 0.7789 (0.7833) time: 0.2696 data: 0.0002 max mem: 26157 Train: [67] [6200/6250] eta: 0:00:13 lr: 0.000032 grad: 0.1647 (0.2414) loss: 0.7793 (0.7832) time: 0.2687 data: 0.0002 max mem: 26157 Train: [67] [6249/6250] eta: 0:00:00 lr: 0.000032 grad: 0.1602 (0.2412) loss: 0.7773 (0.7832) time: 0.2700 data: 0.0002 max mem: 26157 Train: [67] Total time: 0:28:09 (0.2704 s / it) Averaged stats: lr: 0.000032 grad: 0.1602 (0.2412) loss: 0.7773 (0.7832) Eval (hcp-train-subset): [67] [ 0/62] eta: 0:04:36 loss: 0.8169 (0.8169) time: 4.4562 data: 4.3707 max mem: 26157 Eval (hcp-train-subset): [67] [61/62] eta: 0:00:00 loss: 0.8100 (0.8095) time: 0.1173 data: 0.0347 max mem: 26157 Eval (hcp-train-subset): [67] Total time: 0:00:12 (0.1993 s / it) Averaged stats (hcp-train-subset): loss: 0.8100 (0.8095) Making plots (hcp-train-subset): example=10 Eval (hcp-val): [67] [ 0/62] eta: 0:05:21 loss: 0.8176 (0.8176) time: 5.1829 data: 5.0960 max mem: 26157 Eval (hcp-val): [67] [61/62] eta: 0:00:00 loss: 0.8232 (0.8244) time: 0.1272 data: 0.0444 max mem: 26157 Eval (hcp-val): [67] Total time: 0:00:12 (0.2064 s / it) Averaged stats (hcp-val): loss: 0.8232 (0.8244) Making plots (hcp-val): example=7 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [68] [ 0/6250] eta: 10:55:54 lr: 0.000032 grad: 0.3396 (0.3396) loss: 0.7598 (0.7598) time: 6.2967 data: 6.0246 max mem: 26157 Train: [68] [ 100/6250] eta: 0:33:54 lr: 0.000032 grad: 0.1740 (0.2244) loss: 0.7901 (0.8028) time: 0.2715 data: 0.0002 max mem: 26157 Train: [68] [ 200/6250] eta: 0:30:13 lr: 0.000032 grad: 0.1676 (0.2262) loss: 0.7907 (0.7965) time: 0.2690 data: 0.0002 max mem: 26157 Train: [68] [ 300/6250] eta: 0:28:40 lr: 0.000032 grad: 0.1779 (0.2285) loss: 0.7950 (0.7942) time: 0.2679 data: 0.0002 max mem: 26157 Train: [68] [ 400/6250] eta: 0:27:40 lr: 0.000032 grad: 0.1677 (0.2209) loss: 0.7851 (0.7934) time: 0.2677 data: 0.0001 max mem: 26157 Train: [68] [ 500/6250] eta: 0:26:54 lr: 0.000032 grad: 0.1562 (0.2138) loss: 0.7928 (0.7933) time: 0.2680 data: 0.0002 max mem: 26157 Train: [68] [ 600/6250] eta: 0:26:14 lr: 0.000032 grad: 0.1563 (0.2124) loss: 0.8043 (0.7930) time: 0.2684 data: 0.0002 max mem: 26157 Train: [68] [ 700/6250] eta: 0:25:38 lr: 0.000032 grad: 0.1539 (0.2193) loss: 0.7971 (0.7933) time: 0.2686 data: 0.0002 max mem: 26157 Train: [68] [ 800/6250] eta: 0:25:05 lr: 0.000032 grad: 0.1532 (0.2201) loss: 0.7984 (0.7936) time: 0.2692 data: 0.0002 max mem: 26157 Train: [68] [ 900/6250] eta: 0:24:33 lr: 0.000032 grad: 0.1540 (0.2142) loss: 0.7936 (0.7938) time: 0.2679 data: 0.0002 max mem: 26157 Train: [68] [1000/6250] eta: 0:24:02 lr: 0.000032 grad: 0.1775 (0.2137) loss: 0.7895 (0.7938) time: 0.2682 data: 0.0002 max mem: 26157 Train: [68] [1100/6250] eta: 0:23:32 lr: 0.000032 grad: 0.1550 (0.2130) loss: 0.7910 (0.7936) time: 0.2693 data: 0.0002 max mem: 26157 Train: [68] [1200/6250] eta: 0:23:02 lr: 0.000032 grad: 0.1657 (0.2152) loss: 0.7890 (0.7934) time: 0.2693 data: 0.0002 max mem: 26157 Train: [68] [1300/6250] eta: 0:22:33 lr: 0.000031 grad: 0.1608 (0.2150) loss: 0.7949 (0.7933) time: 0.2684 data: 0.0002 max mem: 26157 Train: [68] [1400/6250] eta: 0:22:04 lr: 0.000031 grad: 0.1929 (0.2176) loss: 0.7877 (0.7930) time: 0.2675 data: 0.0001 max mem: 26157 Train: [68] [1500/6250] eta: 0:21:35 lr: 0.000031 grad: 0.1687 (0.2162) loss: 0.7842 (0.7927) time: 0.2686 data: 0.0002 max mem: 26157 Train: [68] [1600/6250] eta: 0:21:07 lr: 0.000031 grad: 0.1738 (0.2196) loss: 0.7823 (0.7923) time: 0.2672 data: 0.0001 max mem: 26157 Train: [68] [1700/6250] eta: 0:20:39 lr: 0.000031 grad: 0.1598 (0.2199) loss: 0.7841 (0.7920) time: 0.2706 data: 0.0002 max mem: 26157 Train: [68] [1800/6250] eta: 0:20:10 lr: 0.000031 grad: 0.1711 (0.2204) loss: 0.7825 (0.7916) time: 0.2678 data: 0.0002 max mem: 26157 Train: [68] [1900/6250] eta: 0:19:42 lr: 0.000031 grad: 0.1605 (0.2224) loss: 0.7909 (0.7914) time: 0.2690 data: 0.0002 max mem: 26157 Train: [68] [2000/6250] eta: 0:19:14 lr: 0.000031 grad: 0.1778 (0.2236) loss: 0.7933 (0.7911) time: 0.2671 data: 0.0002 max mem: 26157 Train: [68] [2100/6250] eta: 0:18:46 lr: 0.000031 grad: 0.1859 (0.2281) loss: 0.7769 (0.7908) time: 0.2686 data: 0.0001 max mem: 26157 Train: [68] [2200/6250] eta: 0:18:19 lr: 0.000031 grad: 0.1608 (0.2281) loss: 0.7877 (0.7906) time: 0.2671 data: 0.0002 max mem: 26157 Train: [68] [2300/6250] eta: 0:17:51 lr: 0.000031 grad: 0.1608 (0.2295) loss: 0.7881 (0.7903) time: 0.2703 data: 0.0002 max mem: 26157 Train: [68] [2400/6250] eta: 0:17:23 lr: 0.000031 grad: 0.1672 (0.2298) loss: 0.7862 (0.7900) time: 0.2672 data: 0.0002 max mem: 26157 Train: [68] [2500/6250] eta: 0:16:56 lr: 0.000031 grad: 0.1654 (0.2281) loss: 0.7834 (0.7897) time: 0.2683 data: 0.0002 max mem: 26157 Train: [68] [2600/6250] eta: 0:16:28 lr: 0.000031 grad: 0.1778 (0.2296) loss: 0.7807 (0.7893) time: 0.2707 data: 0.0002 max mem: 26157 Train: [68] [2700/6250] eta: 0:16:01 lr: 0.000031 grad: 0.1788 (0.2300) loss: 0.7851 (0.7891) time: 0.2697 data: 0.0002 max mem: 26157 Train: [68] [2800/6250] eta: 0:15:34 lr: 0.000031 grad: 0.1684 (0.2323) loss: 0.7819 (0.7889) time: 0.2681 data: 0.0002 max mem: 26157 Train: [68] [2900/6250] eta: 0:15:06 lr: 0.000031 grad: 0.1927 (0.2336) loss: 0.7806 (0.7886) time: 0.2680 data: 0.0001 max mem: 26157 Train: [68] [3000/6250] eta: 0:14:39 lr: 0.000031 grad: 0.1673 (0.2340) loss: 0.7808 (0.7882) time: 0.2681 data: 0.0002 max mem: 26157 Train: [68] [3100/6250] eta: 0:14:12 lr: 0.000031 grad: 0.1759 (0.2351) loss: 0.7783 (0.7877) time: 0.2693 data: 0.0002 max mem: 26157 Train: [68] [3200/6250] eta: 0:13:44 lr: 0.000031 grad: 0.1767 (0.2353) loss: 0.7729 (0.7874) time: 0.2678 data: 0.0001 max mem: 26157 Train: [68] [3300/6250] eta: 0:13:17 lr: 0.000031 grad: 0.1703 (0.2354) loss: 0.7835 (0.7873) time: 0.2682 data: 0.0002 max mem: 26157 Train: [68] [3400/6250] eta: 0:12:50 lr: 0.000031 grad: 0.1768 (0.2351) loss: 0.7740 (0.7871) time: 0.2682 data: 0.0001 max mem: 26157 Train: [68] [3500/6250] eta: 0:12:23 lr: 0.000031 grad: 0.1598 (0.2353) loss: 0.7850 (0.7869) time: 0.2685 data: 0.0001 max mem: 26157 Train: [68] [3600/6250] eta: 0:11:56 lr: 0.000031 grad: 0.1780 (0.2357) loss: 0.7737 (0.7868) time: 0.2686 data: 0.0002 max mem: 26157 Train: [68] [3700/6250] eta: 0:11:29 lr: 0.000031 grad: 0.1755 (0.2364) loss: 0.7902 (0.7867) time: 0.2687 data: 0.0002 max mem: 26157 Train: [68] [3800/6250] eta: 0:11:01 lr: 0.000031 grad: 0.1661 (0.2366) loss: 0.7823 (0.7865) time: 0.2677 data: 0.0002 max mem: 26157 Train: [68] [3900/6250] eta: 0:10:34 lr: 0.000031 grad: 0.1742 (0.2385) loss: 0.7809 (0.7863) time: 0.2682 data: 0.0002 max mem: 26157 Train: [68] [4000/6250] eta: 0:10:07 lr: 0.000031 grad: 0.1744 (0.2376) loss: 0.7783 (0.7862) time: 0.2678 data: 0.0002 max mem: 26157 Train: [68] [4100/6250] eta: 0:09:40 lr: 0.000031 grad: 0.1915 (0.2374) loss: 0.7834 (0.7861) time: 0.2695 data: 0.0002 max mem: 26157 Train: [68] [4200/6250] eta: 0:09:13 lr: 0.000031 grad: 0.1873 (0.2367) loss: 0.7755 (0.7859) time: 0.2680 data: 0.0002 max mem: 26157 Train: [68] [4300/6250] eta: 0:08:46 lr: 0.000031 grad: 0.1615 (0.2359) loss: 0.7805 (0.7858) time: 0.2689 data: 0.0002 max mem: 26157 Train: [68] [4400/6250] eta: 0:08:19 lr: 0.000031 grad: 0.1661 (0.2363) loss: 0.7837 (0.7857) time: 0.2677 data: 0.0002 max mem: 26157 Train: [68] [4500/6250] eta: 0:07:52 lr: 0.000031 grad: 0.1700 (0.2365) loss: 0.7779 (0.7856) time: 0.2678 data: 0.0001 max mem: 26157 Train: [68] [4600/6250] eta: 0:07:25 lr: 0.000031 grad: 0.1620 (0.2364) loss: 0.7714 (0.7854) time: 0.2683 data: 0.0002 max mem: 26157 Train: [68] [4700/6250] eta: 0:06:58 lr: 0.000031 grad: 0.1746 (0.2366) loss: 0.7758 (0.7853) time: 0.2678 data: 0.0001 max mem: 26157 Train: [68] [4800/6250] eta: 0:06:31 lr: 0.000030 grad: 0.1773 (0.2364) loss: 0.7715 (0.7851) time: 0.2678 data: 0.0002 max mem: 26157 Train: [68] [4900/6250] eta: 0:06:04 lr: 0.000030 grad: 0.1727 (0.2365) loss: 0.7784 (0.7850) time: 0.2677 data: 0.0002 max mem: 26157 Train: [68] [5000/6250] eta: 0:05:37 lr: 0.000030 grad: 0.1814 (0.2374) loss: 0.7836 (0.7849) time: 0.2687 data: 0.0002 max mem: 26157 Train: [68] [5100/6250] eta: 0:05:10 lr: 0.000030 grad: 0.1744 (0.2394) loss: 0.7730 (0.7847) time: 0.2689 data: 0.0002 max mem: 26157 Train: [68] [5200/6250] eta: 0:04:43 lr: 0.000030 grad: 0.1760 (0.2399) loss: 0.7661 (0.7846) time: 0.2684 data: 0.0002 max mem: 26157 Train: [68] [5300/6250] eta: 0:04:16 lr: 0.000030 grad: 0.1651 (0.2405) loss: 0.7751 (0.7844) time: 0.2690 data: 0.0002 max mem: 26157 Train: [68] [5400/6250] eta: 0:03:49 lr: 0.000030 grad: 0.1702 (0.2406) loss: 0.7738 (0.7842) time: 0.2680 data: 0.0002 max mem: 26157 Train: [68] [5500/6250] eta: 0:03:22 lr: 0.000030 grad: 0.1674 (0.2401) loss: 0.7754 (0.7841) time: 0.2678 data: 0.0002 max mem: 26157 Train: [68] [5600/6250] eta: 0:02:55 lr: 0.000030 grad: 0.1661 (0.2418) loss: 0.7723 (0.7839) time: 0.2679 data: 0.0001 max mem: 26157 Train: [68] [5700/6250] eta: 0:02:28 lr: 0.000030 grad: 0.1716 (0.2424) loss: 0.7792 (0.7838) time: 0.2671 data: 0.0001 max mem: 26157 Train: [68] [5800/6250] eta: 0:02:01 lr: 0.000030 grad: 0.1787 (0.2429) loss: 0.7841 (0.7837) time: 0.2675 data: 0.0001 max mem: 26157 Train: [68] [5900/6250] eta: 0:01:34 lr: 0.000030 grad: 0.1677 (0.2432) loss: 0.7796 (0.7836) time: 0.2672 data: 0.0002 max mem: 26157 Train: [68] [6000/6250] eta: 0:01:07 lr: 0.000030 grad: 0.1698 (0.2425) loss: 0.7814 (0.7835) time: 0.2684 data: 0.0002 max mem: 26157 Train: [68] [6100/6250] eta: 0:00:40 lr: 0.000030 grad: 0.1653 (0.2422) loss: 0.7811 (0.7835) time: 0.2671 data: 0.0001 max mem: 26157 Train: [68] [6200/6250] eta: 0:00:13 lr: 0.000030 grad: 0.1660 (0.2421) loss: 0.7816 (0.7834) time: 0.2677 data: 0.0002 max mem: 26157 Train: [68] [6249/6250] eta: 0:00:00 lr: 0.000030 grad: 0.1685 (0.2423) loss: 0.7786 (0.7834) time: 0.2667 data: 0.0001 max mem: 26157 Train: [68] Total time: 0:28:09 (0.2704 s / it) Averaged stats: lr: 0.000030 grad: 0.1685 (0.2423) loss: 0.7786 (0.7834) Eval (hcp-train-subset): [68] [ 0/62] eta: 0:05:07 loss: 0.8197 (0.8197) time: 4.9578 data: 4.8745 max mem: 26157 Eval (hcp-train-subset): [68] [61/62] eta: 0:00:00 loss: 0.8043 (0.8069) time: 0.0879 data: 0.0055 max mem: 26157 Eval (hcp-train-subset): [68] Total time: 0:00:11 (0.1821 s / it) Averaged stats (hcp-train-subset): loss: 0.8043 (0.8069) Making plots (hcp-train-subset): example=30 Eval (hcp-val): [68] [ 0/62] eta: 0:03:19 loss: 0.8240 (0.8240) time: 3.2128 data: 3.1080 max mem: 26157 Eval (hcp-val): [68] [61/62] eta: 0:00:00 loss: 0.8231 (0.8246) time: 0.1029 data: 0.0188 max mem: 26157 Eval (hcp-val): [68] Total time: 0:00:11 (0.1899 s / it) Averaged stats (hcp-val): loss: 0.8231 (0.8246) Making plots (hcp-val): example=56 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [69] [ 0/6250] eta: 9:44:32 lr: 0.000030 grad: 0.2337 (0.2337) loss: 0.8156 (0.8156) time: 5.6117 data: 5.3311 max mem: 26157 Train: [69] [ 100/6250] eta: 0:33:01 lr: 0.000030 grad: 0.1578 (0.2654) loss: 0.8021 (0.8021) time: 0.2672 data: 0.0002 max mem: 26157 Train: [69] [ 200/6250] eta: 0:29:49 lr: 0.000030 grad: 0.1773 (0.2423) loss: 0.7876 (0.7974) time: 0.2697 data: 0.0001 max mem: 26157 Train: [69] [ 300/6250] eta: 0:28:24 lr: 0.000030 grad: 0.1868 (0.2464) loss: 0.7803 (0.7929) time: 0.2677 data: 0.0001 max mem: 26157 Train: [69] [ 400/6250] eta: 0:27:28 lr: 0.000030 grad: 0.1988 (0.2562) loss: 0.7794 (0.7894) time: 0.2700 data: 0.0002 max mem: 26157 Train: [69] [ 500/6250] eta: 0:26:44 lr: 0.000030 grad: 0.1851 (0.2548) loss: 0.7796 (0.7878) time: 0.2682 data: 0.0002 max mem: 26157 Train: [69] [ 600/6250] eta: 0:26:06 lr: 0.000030 grad: 0.1715 (0.2624) loss: 0.7860 (0.7864) time: 0.2677 data: 0.0001 max mem: 26157 Train: [69] [ 700/6250] eta: 0:25:31 lr: 0.000030 grad: 0.1819 (0.2740) loss: 0.7772 (0.7854) time: 0.2678 data: 0.0001 max mem: 26157 Train: [69] [ 800/6250] eta: 0:24:58 lr: 0.000030 grad: 0.1837 (0.2772) loss: 0.7815 (0.7850) time: 0.2681 data: 0.0001 max mem: 26157 Train: [69] [ 900/6250] eta: 0:24:26 lr: 0.000030 grad: 0.1633 (0.2706) loss: 0.7815 (0.7842) time: 0.2684 data: 0.0002 max mem: 26157 Train: [69] [1000/6250] eta: 0:23:55 lr: 0.000030 grad: 0.1852 (0.2701) loss: 0.7786 (0.7838) time: 0.2683 data: 0.0001 max mem: 26157 Train: [69] [1100/6250] eta: 0:23:25 lr: 0.000030 grad: 0.1646 (0.2640) loss: 0.7770 (0.7832) time: 0.2668 data: 0.0001 max mem: 26157 Train: [69] [1200/6250] eta: 0:22:56 lr: 0.000030 grad: 0.1701 (0.2651) loss: 0.7824 (0.7825) time: 0.2675 data: 0.0001 max mem: 26157 Train: [69] [1300/6250] eta: 0:22:26 lr: 0.000030 grad: 0.1714 (0.2650) loss: 0.7803 (0.7823) time: 0.2679 data: 0.0002 max mem: 26157 Train: [69] [1400/6250] eta: 0:21:57 lr: 0.000030 grad: 0.1639 (0.2613) loss: 0.7729 (0.7818) time: 0.2678 data: 0.0001 max mem: 26157 Train: [69] [1500/6250] eta: 0:21:29 lr: 0.000030 grad: 0.1663 (0.2602) loss: 0.7896 (0.7815) time: 0.2684 data: 0.0001 max mem: 26157 Train: [69] [1600/6250] eta: 0:21:01 lr: 0.000030 grad: 0.1600 (0.2648) loss: 0.7726 (0.7810) time: 0.2689 data: 0.0002 max mem: 26157 Train: [69] [1700/6250] eta: 0:20:33 lr: 0.000030 grad: 0.1891 (0.2655) loss: 0.7729 (0.7808) time: 0.2684 data: 0.0002 max mem: 26157 Train: [69] [1800/6250] eta: 0:20:05 lr: 0.000030 grad: 0.1689 (0.2654) loss: 0.7811 (0.7805) time: 0.2673 data: 0.0002 max mem: 26157 Train: [69] [1900/6250] eta: 0:19:37 lr: 0.000030 grad: 0.1803 (0.2693) loss: 0.7791 (0.7804) time: 0.2681 data: 0.0001 max mem: 26157 Train: [69] [2000/6250] eta: 0:19:09 lr: 0.000030 grad: 0.1782 (0.2714) loss: 0.7813 (0.7803) time: 0.2673 data: 0.0002 max mem: 26157 Train: [69] [2100/6250] eta: 0:18:42 lr: 0.000029 grad: 0.1630 (0.2698) loss: 0.7811 (0.7803) time: 0.2674 data: 0.0002 max mem: 26157 Train: [69] [2200/6250] eta: 0:18:14 lr: 0.000029 grad: 0.1862 (0.2696) loss: 0.7865 (0.7803) time: 0.2674 data: 0.0001 max mem: 26157 Train: [69] [2300/6250] eta: 0:17:47 lr: 0.000029 grad: 0.1701 (0.2689) loss: 0.7787 (0.7803) time: 0.2691 data: 0.0002 max mem: 26157 Train: [69] [2400/6250] eta: 0:17:19 lr: 0.000029 grad: 0.1722 (0.2683) loss: 0.7822 (0.7803) time: 0.2682 data: 0.0002 max mem: 26157 Train: [69] [2500/6250] eta: 0:16:52 lr: 0.000029 grad: 0.1637 (0.2685) loss: 0.7916 (0.7805) time: 0.2684 data: 0.0001 max mem: 26157 Train: [69] [2600/6250] eta: 0:16:25 lr: 0.000029 grad: 0.1604 (0.2677) loss: 0.7879 (0.7806) time: 0.2674 data: 0.0001 max mem: 26157 Train: [69] [2700/6250] eta: 0:15:57 lr: 0.000029 grad: 0.1774 (0.2669) loss: 0.7819 (0.7808) time: 0.2672 data: 0.0001 max mem: 26157 Train: [69] [2800/6250] eta: 0:15:30 lr: 0.000029 grad: 0.1848 (0.2657) loss: 0.7805 (0.7809) time: 0.2667 data: 0.0001 max mem: 26157 Train: [69] [2900/6250] eta: 0:15:03 lr: 0.000029 grad: 0.1607 (0.2663) loss: 0.7905 (0.7811) time: 0.2684 data: 0.0002 max mem: 26157 Train: [69] [3000/6250] eta: 0:14:36 lr: 0.000029 grad: 0.1685 (0.2660) loss: 0.7839 (0.7812) time: 0.2689 data: 0.0002 max mem: 26157 Train: [69] [3100/6250] eta: 0:14:08 lr: 0.000029 grad: 0.1713 (0.2646) loss: 0.7886 (0.7813) time: 0.2683 data: 0.0002 max mem: 26157 Train: [69] [3200/6250] eta: 0:13:41 lr: 0.000029 grad: 0.1740 (0.2631) loss: 0.7828 (0.7814) time: 0.2680 data: 0.0001 max mem: 26157 Train: [69] [3300/6250] eta: 0:13:14 lr: 0.000029 grad: 0.1909 (0.2648) loss: 0.7778 (0.7814) time: 0.2674 data: 0.0002 max mem: 26157 Train: [69] [3400/6250] eta: 0:12:47 lr: 0.000029 grad: 0.1763 (0.2643) loss: 0.7803 (0.7815) time: 0.2681 data: 0.0001 max mem: 26157 Train: [69] [3500/6250] eta: 0:12:20 lr: 0.000029 grad: 0.1637 (0.2633) loss: 0.7826 (0.7815) time: 0.2671 data: 0.0001 max mem: 26157 Train: [69] [3600/6250] eta: 0:11:53 lr: 0.000029 grad: 0.1744 (0.2636) loss: 0.7849 (0.7816) time: 0.2669 data: 0.0001 max mem: 26157 Train: [69] [3700/6250] eta: 0:11:26 lr: 0.000029 grad: 0.1703 (0.2624) loss: 0.7853 (0.7817) time: 0.2680 data: 0.0002 max mem: 26157 Train: [69] [3800/6250] eta: 0:10:59 lr: 0.000029 grad: 0.1719 (0.2641) loss: 0.7780 (0.7817) time: 0.2673 data: 0.0002 max mem: 26157 Train: [69] [3900/6250] eta: 0:10:32 lr: 0.000029 grad: 0.1754 (0.2653) loss: 0.7763 (0.7817) time: 0.2694 data: 0.0002 max mem: 26157 Train: [69] [4000/6250] eta: 0:10:05 lr: 0.000029 grad: 0.1747 (0.2659) loss: 0.7825 (0.7817) time: 0.2684 data: 0.0001 max mem: 26157 Train: [69] [4100/6250] eta: 0:09:38 lr: 0.000029 grad: 0.1830 (0.2674) loss: 0.7850 (0.7817) time: 0.2683 data: 0.0001 max mem: 26157 Train: [69] [4200/6250] eta: 0:09:11 lr: 0.000029 grad: 0.1673 (0.2669) loss: 0.7824 (0.7816) time: 0.2668 data: 0.0001 max mem: 26157 Train: [69] [4300/6250] eta: 0:08:44 lr: 0.000029 grad: 0.1796 (0.2677) loss: 0.7792 (0.7816) time: 0.2675 data: 0.0002 max mem: 26157 Train: [69] [4400/6250] eta: 0:08:17 lr: 0.000029 grad: 0.1664 (0.2678) loss: 0.7809 (0.7816) time: 0.2685 data: 0.0001 max mem: 26157 Train: [69] [4500/6250] eta: 0:07:50 lr: 0.000029 grad: 0.1882 (0.2689) loss: 0.7841 (0.7816) time: 0.2678 data: 0.0001 max mem: 26157 Train: [69] [4600/6250] eta: 0:07:23 lr: 0.000029 grad: 0.1792 (0.2682) loss: 0.7821 (0.7815) time: 0.2678 data: 0.0002 max mem: 26157 Train: [69] [4700/6250] eta: 0:06:56 lr: 0.000029 grad: 0.1866 (0.2680) loss: 0.7699 (0.7815) time: 0.2670 data: 0.0001 max mem: 26157 Train: [69] [4800/6250] eta: 0:06:29 lr: 0.000029 grad: 0.1803 (0.2685) loss: 0.7813 (0.7814) time: 0.2673 data: 0.0001 max mem: 26157 Train: [69] [4900/6250] eta: 0:06:02 lr: 0.000029 grad: 0.1817 (0.2684) loss: 0.7700 (0.7813) time: 0.2691 data: 0.0001 max mem: 26157 Train: [69] [5000/6250] eta: 0:05:36 lr: 0.000029 grad: 0.1778 (0.2696) loss: 0.7776 (0.7812) time: 0.2678 data: 0.0001 max mem: 26157 Train: [69] [5100/6250] eta: 0:05:09 lr: 0.000029 grad: 0.1804 (0.2699) loss: 0.7778 (0.7812) time: 0.2678 data: 0.0001 max mem: 26157 Train: [69] [5200/6250] eta: 0:04:42 lr: 0.000029 grad: 0.2092 (0.2703) loss: 0.7813 (0.7811) time: 0.2688 data: 0.0002 max mem: 26157 Train: [69] [5300/6250] eta: 0:04:15 lr: 0.000029 grad: 0.1933 (0.2711) loss: 0.7798 (0.7810) time: 0.2702 data: 0.0002 max mem: 26157 Train: [69] [5400/6250] eta: 0:03:48 lr: 0.000029 grad: 0.1750 (0.2711) loss: 0.7843 (0.7810) time: 0.2673 data: 0.0002 max mem: 26157 Train: [69] [5500/6250] eta: 0:03:21 lr: 0.000029 grad: 0.1775 (0.2712) loss: 0.7713 (0.7809) time: 0.2676 data: 0.0002 max mem: 26157 Train: [69] [5600/6250] eta: 0:02:54 lr: 0.000028 grad: 0.1849 (0.2710) loss: 0.7782 (0.7808) time: 0.2671 data: 0.0001 max mem: 26157 Train: [69] [5700/6250] eta: 0:02:27 lr: 0.000028 grad: 0.1737 (0.2708) loss: 0.7773 (0.7808) time: 0.2672 data: 0.0001 max mem: 26157 Train: [69] [5800/6250] eta: 0:02:00 lr: 0.000028 grad: 0.1852 (0.2706) loss: 0.7831 (0.7807) time: 0.2671 data: 0.0001 max mem: 26157 Train: [69] [5900/6250] eta: 0:01:34 lr: 0.000028 grad: 0.1729 (0.2706) loss: 0.7702 (0.7806) time: 0.2671 data: 0.0002 max mem: 26157 Train: [69] [6000/6250] eta: 0:01:07 lr: 0.000028 grad: 0.1848 (0.2700) loss: 0.7802 (0.7806) time: 0.2665 data: 0.0001 max mem: 26157 Train: [69] [6100/6250] eta: 0:00:40 lr: 0.000028 grad: 0.1646 (0.2695) loss: 0.7848 (0.7806) time: 0.2668 data: 0.0002 max mem: 26157 Train: [69] [6200/6250] eta: 0:00:13 lr: 0.000028 grad: 0.1748 (0.2693) loss: 0.7841 (0.7806) time: 0.2670 data: 0.0001 max mem: 26157 Train: [69] [6249/6250] eta: 0:00:00 lr: 0.000028 grad: 0.1737 (0.2693) loss: 0.7724 (0.7806) time: 0.2678 data: 0.0001 max mem: 26157 Train: [69] Total time: 0:28:02 (0.2692 s / it) Averaged stats: lr: 0.000028 grad: 0.1737 (0.2693) loss: 0.7724 (0.7806) Eval (hcp-train-subset): [69] [ 0/62] eta: 0:04:21 loss: 0.8197 (0.8197) time: 4.2228 data: 4.1394 max mem: 26157 Eval (hcp-train-subset): [69] [61/62] eta: 0:00:00 loss: 0.8066 (0.8079) time: 0.1187 data: 0.0361 max mem: 26157 Eval (hcp-train-subset): [69] Total time: 0:00:11 (0.1873 s / it) Averaged stats (hcp-train-subset): loss: 0.8066 (0.8079) Making plots (hcp-train-subset): example=4 Eval (hcp-val): [69] [ 0/62] eta: 0:05:13 loss: 0.8201 (0.8201) time: 5.0606 data: 4.9764 max mem: 26157 Eval (hcp-val): [69] [61/62] eta: 0:00:00 loss: 0.8229 (0.8243) time: 0.1256 data: 0.0411 max mem: 26157 Eval (hcp-val): [69] Total time: 0:00:11 (0.1911 s / it) Averaged stats (hcp-val): loss: 0.8229 (0.8243) Making plots (hcp-val): example=45 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [70] [ 0/6250] eta: 6:59:29 lr: 0.000028 grad: 0.6485 (0.6485) loss: 0.8006 (0.8006) time: 4.0271 data: 3.6870 max mem: 26157 Train: [70] [ 100/6250] eta: 0:32:49 lr: 0.000028 grad: 0.2276 (0.2825) loss: 0.7975 (0.8021) time: 0.2669 data: 0.0001 max mem: 26157 Train: [70] [ 200/6250] eta: 0:29:39 lr: 0.000028 grad: 0.2444 (0.2855) loss: 0.7643 (0.7899) time: 0.2677 data: 0.0002 max mem: 26157 Train: [70] [ 300/6250] eta: 0:28:17 lr: 0.000028 grad: 0.2015 (0.2841) loss: 0.7881 (0.7864) time: 0.2667 data: 0.0002 max mem: 26157 Train: [70] [ 400/6250] eta: 0:27:22 lr: 0.000028 grad: 0.1973 (0.2854) loss: 0.7850 (0.7849) time: 0.2673 data: 0.0002 max mem: 26157 Train: [70] [ 500/6250] eta: 0:26:39 lr: 0.000028 grad: 0.1744 (0.2872) loss: 0.7839 (0.7849) time: 0.2675 data: 0.0002 max mem: 26157 Train: [70] [ 600/6250] eta: 0:26:01 lr: 0.000028 grad: 0.1692 (0.2796) loss: 0.7886 (0.7850) time: 0.2675 data: 0.0001 max mem: 26157 Train: [70] [ 700/6250] eta: 0:25:27 lr: 0.000028 grad: 0.1783 (0.2766) loss: 0.7943 (0.7851) time: 0.2686 data: 0.0001 max mem: 26157 Train: [70] [ 800/6250] eta: 0:24:55 lr: 0.000028 grad: 0.1742 (0.2670) loss: 0.7859 (0.7851) time: 0.2703 data: 0.0002 max mem: 26157 Train: [70] [ 900/6250] eta: 0:24:24 lr: 0.000028 grad: 0.1664 (0.2633) loss: 0.7886 (0.7850) time: 0.2677 data: 0.0002 max mem: 26157 Train: [70] [1000/6250] eta: 0:23:54 lr: 0.000028 grad: 0.1691 (0.2571) loss: 0.7813 (0.7845) time: 0.2677 data: 0.0002 max mem: 26157 Train: [70] [1100/6250] eta: 0:23:24 lr: 0.000028 grad: 0.1733 (0.2581) loss: 0.7830 (0.7841) time: 0.2669 data: 0.0002 max mem: 26157 Train: [70] [1200/6250] eta: 0:22:54 lr: 0.000028 grad: 0.1688 (0.2600) loss: 0.7842 (0.7836) time: 0.2669 data: 0.0002 max mem: 26157 Train: [70] [1300/6250] eta: 0:22:25 lr: 0.000028 grad: 0.1732 (0.2581) loss: 0.7753 (0.7832) time: 0.2674 data: 0.0001 max mem: 26157 Train: [70] [1400/6250] eta: 0:21:56 lr: 0.000028 grad: 0.1649 (0.2567) loss: 0.7746 (0.7828) time: 0.2668 data: 0.0002 max mem: 26157 Train: [70] [1500/6250] eta: 0:21:28 lr: 0.000028 grad: 0.1700 (0.2555) loss: 0.7793 (0.7824) time: 0.2670 data: 0.0001 max mem: 26157 Train: [70] [1600/6250] eta: 0:21:00 lr: 0.000028 grad: 0.1757 (0.2597) loss: 0.7699 (0.7821) time: 0.2680 data: 0.0001 max mem: 26157 Train: [70] [1700/6250] eta: 0:20:31 lr: 0.000028 grad: 0.1798 (0.2606) loss: 0.7687 (0.7817) time: 0.2678 data: 0.0002 max mem: 26157 Train: [70] [1800/6250] eta: 0:20:04 lr: 0.000028 grad: 0.1790 (0.2603) loss: 0.7762 (0.7813) time: 0.2679 data: 0.0002 max mem: 26157 Train: [70] [1900/6250] eta: 0:19:36 lr: 0.000028 grad: 0.1643 (0.2589) loss: 0.7839 (0.7811) time: 0.2678 data: 0.0002 max mem: 26157 Train: [70] [2000/6250] eta: 0:19:08 lr: 0.000028 grad: 0.1774 (0.2573) loss: 0.7778 (0.7809) time: 0.2681 data: 0.0002 max mem: 26157 Train: [70] [2100/6250] eta: 0:18:41 lr: 0.000028 grad: 0.1813 (0.2571) loss: 0.7793 (0.7806) time: 0.2667 data: 0.0001 max mem: 26157 Train: [70] [2200/6250] eta: 0:18:13 lr: 0.000028 grad: 0.1726 (0.2561) loss: 0.7822 (0.7804) time: 0.2678 data: 0.0002 max mem: 26157 Train: [70] [2300/6250] eta: 0:17:46 lr: 0.000028 grad: 0.1750 (0.2584) loss: 0.7770 (0.7804) time: 0.2670 data: 0.0002 max mem: 26157 Train: [70] [2400/6250] eta: 0:17:18 lr: 0.000028 grad: 0.1849 (0.2581) loss: 0.7835 (0.7803) time: 0.2679 data: 0.0002 max mem: 26157 Train: [70] [2500/6250] eta: 0:16:51 lr: 0.000028 grad: 0.1728 (0.2578) loss: 0.7804 (0.7802) time: 0.2673 data: 0.0002 max mem: 26157 Train: [70] [2600/6250] eta: 0:16:24 lr: 0.000028 grad: 0.1731 (0.2585) loss: 0.7801 (0.7803) time: 0.2670 data: 0.0001 max mem: 26157 Train: [70] [2700/6250] eta: 0:15:56 lr: 0.000028 grad: 0.1713 (0.2596) loss: 0.7810 (0.7804) time: 0.2689 data: 0.0002 max mem: 26157 Train: [70] [2800/6250] eta: 0:15:29 lr: 0.000028 grad: 0.1790 (0.2591) loss: 0.7848 (0.7806) time: 0.2662 data: 0.0001 max mem: 26157 Train: [70] [2900/6250] eta: 0:15:02 lr: 0.000028 grad: 0.1717 (0.2604) loss: 0.7814 (0.7807) time: 0.2673 data: 0.0001 max mem: 26157 Train: [70] [3000/6250] eta: 0:14:35 lr: 0.000027 grad: 0.1748 (0.2602) loss: 0.7855 (0.7808) time: 0.2668 data: 0.0001 max mem: 26157 Train: [70] [3100/6250] eta: 0:14:08 lr: 0.000027 grad: 0.1769 (0.2596) loss: 0.7896 (0.7808) time: 0.2689 data: 0.0002 max mem: 26157 Train: [70] [3200/6250] eta: 0:13:41 lr: 0.000027 grad: 0.1701 (0.2600) loss: 0.7911 (0.7810) time: 0.2676 data: 0.0002 max mem: 26157 Train: [70] [3300/6250] eta: 0:13:14 lr: 0.000027 grad: 0.1817 (0.2604) loss: 0.7868 (0.7811) time: 0.2670 data: 0.0002 max mem: 26157 Train: [70] [3400/6250] eta: 0:12:46 lr: 0.000027 grad: 0.1716 (0.2587) loss: 0.7798 (0.7812) time: 0.2670 data: 0.0001 max mem: 26157 Train: [70] [3500/6250] eta: 0:12:20 lr: 0.000027 grad: 0.1671 (0.2603) loss: 0.7849 (0.7813) time: 0.2686 data: 0.0002 max mem: 26157 Train: [70] [3600/6250] eta: 0:11:52 lr: 0.000027 grad: 0.1720 (0.2598) loss: 0.7879 (0.7815) time: 0.2663 data: 0.0001 max mem: 26157 Train: [70] [3700/6250] eta: 0:11:25 lr: 0.000027 grad: 0.1900 (0.2593) loss: 0.7797 (0.7815) time: 0.2680 data: 0.0002 max mem: 26157 Train: [70] [3800/6250] eta: 0:10:58 lr: 0.000027 grad: 0.1743 (0.2597) loss: 0.7814 (0.7816) time: 0.2674 data: 0.0002 max mem: 26157 Train: [70] [3900/6250] eta: 0:10:31 lr: 0.000027 grad: 0.1855 (0.2608) loss: 0.7760 (0.7816) time: 0.2670 data: 0.0002 max mem: 26157 Train: [70] [4000/6250] eta: 0:10:04 lr: 0.000027 grad: 0.1821 (0.2617) loss: 0.7808 (0.7818) time: 0.2667 data: 0.0001 max mem: 26157 Train: [70] [4100/6250] eta: 0:09:38 lr: 0.000027 grad: 0.1740 (0.2620) loss: 0.7851 (0.7819) time: 0.2671 data: 0.0001 max mem: 26157 Train: [70] [4200/6250] eta: 0:09:11 lr: 0.000027 grad: 0.1733 (0.2626) loss: 0.7824 (0.7820) time: 0.2666 data: 0.0001 max mem: 26157 Train: [70] [4300/6250] eta: 0:08:44 lr: 0.000027 grad: 0.1680 (0.2616) loss: 0.7863 (0.7820) time: 0.2671 data: 0.0001 max mem: 26157 Train: [70] [4400/6250] eta: 0:08:17 lr: 0.000027 grad: 0.1716 (0.2630) loss: 0.7774 (0.7821) time: 0.2671 data: 0.0002 max mem: 26157 Train: [70] [4500/6250] eta: 0:07:50 lr: 0.000027 grad: 0.1884 (0.2629) loss: 0.7838 (0.7821) time: 0.2672 data: 0.0001 max mem: 26157 Train: [70] [4600/6250] eta: 0:07:23 lr: 0.000027 grad: 0.1803 (0.2642) loss: 0.7848 (0.7821) time: 0.2662 data: 0.0002 max mem: 26157 Train: [70] [4700/6250] eta: 0:06:56 lr: 0.000027 grad: 0.2300 (0.2663) loss: 0.7855 (0.7821) time: 0.2672 data: 0.0001 max mem: 26157 Train: [70] [4800/6250] eta: 0:06:29 lr: 0.000027 grad: 0.2003 (0.2672) loss: 0.7834 (0.7822) time: 0.2666 data: 0.0001 max mem: 26157 Train: [70] [4900/6250] eta: 0:06:02 lr: 0.000027 grad: 0.2095 (0.2688) loss: 0.7808 (0.7822) time: 0.2675 data: 0.0001 max mem: 26157 Train: [70] [5000/6250] eta: 0:05:35 lr: 0.000027 grad: 0.2251 (0.2696) loss: 0.7954 (0.7823) time: 0.2682 data: 0.0002 max mem: 26157 Train: [70] [5100/6250] eta: 0:05:08 lr: 0.000027 grad: 0.2442 (0.2703) loss: 0.7834 (0.7824) time: 0.2676 data: 0.0001 max mem: 26157 Train: [70] [5200/6250] eta: 0:04:41 lr: 0.000027 grad: 0.1858 (0.2701) loss: 0.7912 (0.7825) time: 0.2680 data: 0.0001 max mem: 26157 Train: [70] [5300/6250] eta: 0:04:15 lr: 0.000027 grad: 0.1914 (0.2702) loss: 0.7844 (0.7827) time: 0.2669 data: 0.0001 max mem: 26157 Train: [70] [5400/6250] eta: 0:03:48 lr: 0.000027 grad: 0.1761 (0.2703) loss: 0.7827 (0.7827) time: 0.2678 data: 0.0001 max mem: 26157 Train: [70] [5500/6250] eta: 0:03:21 lr: 0.000027 grad: 0.1753 (0.2703) loss: 0.7863 (0.7828) time: 0.2670 data: 0.0001 max mem: 26157 Train: [70] [5600/6250] eta: 0:02:54 lr: 0.000027 grad: 0.2347 (0.2700) loss: 0.7807 (0.7828) time: 0.2669 data: 0.0001 max mem: 26157 Train: [70] [5700/6250] eta: 0:02:27 lr: 0.000027 grad: 0.2242 (0.2702) loss: 0.7844 (0.7829) time: 0.2670 data: 0.0001 max mem: 26157 Train: [70] [5800/6250] eta: 0:02:00 lr: 0.000027 grad: 0.1831 (0.2699) loss: 0.7871 (0.7829) time: 0.2670 data: 0.0001 max mem: 26157 Train: [70] [5900/6250] eta: 0:01:33 lr: 0.000027 grad: 0.1816 (0.2697) loss: 0.7853 (0.7830) time: 0.2691 data: 0.0002 max mem: 26157 Train: [70] [6000/6250] eta: 0:01:07 lr: 0.000027 grad: 0.2050 (0.2700) loss: 0.7836 (0.7831) time: 0.2676 data: 0.0001 max mem: 26157 Train: [70] [6100/6250] eta: 0:00:40 lr: 0.000027 grad: 0.1761 (0.2695) loss: 0.7858 (0.7831) time: 0.2668 data: 0.0001 max mem: 26157 Train: [70] [6200/6250] eta: 0:00:13 lr: 0.000027 grad: 0.1846 (0.2689) loss: 0.7757 (0.7831) time: 0.2676 data: 0.0001 max mem: 26157 Train: [70] [6249/6250] eta: 0:00:00 lr: 0.000027 grad: 0.1838 (0.2684) loss: 0.7896 (0.7831) time: 0.2675 data: 0.0001 max mem: 26157 Train: [70] Total time: 0:27:59 (0.2688 s / it) Averaged stats: lr: 0.000027 grad: 0.1838 (0.2684) loss: 0.7896 (0.7831) Eval (hcp-train-subset): [70] [ 0/62] eta: 0:04:18 loss: 0.8167 (0.8167) time: 4.1708 data: 4.0881 max mem: 26157 Eval (hcp-train-subset): [70] [61/62] eta: 0:00:00 loss: 0.8029 (0.8069) time: 0.0888 data: 0.0063 max mem: 26157 Eval (hcp-train-subset): [70] Total time: 0:00:10 (0.1724 s / it) Averaged stats (hcp-train-subset): loss: 0.8029 (0.8069) Making plots (hcp-train-subset): example=31 Eval (hcp-val): [70] [ 0/62] eta: 0:02:47 loss: 0.8197 (0.8197) time: 2.6994 data: 2.5906 max mem: 26157 Eval (hcp-val): [70] [61/62] eta: 0:00:00 loss: 0.8244 (0.8255) time: 0.0912 data: 0.0090 max mem: 26157 Eval (hcp-val): [70] Total time: 0:00:10 (0.1721 s / it) Averaged stats (hcp-val): loss: 0.8244 (0.8255) Making plots (hcp-val): example=3 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [71] [ 0/6250] eta: 7:53:11 lr: 0.000027 grad: 0.6288 (0.6288) loss: 0.8223 (0.8223) time: 4.5426 data: 4.2690 max mem: 26157 Train: [71] [ 100/6250] eta: 0:32:08 lr: 0.000027 grad: 0.2224 (0.3828) loss: 0.8120 (0.8058) time: 0.2690 data: 0.0002 max mem: 26157 Train: [71] [ 200/6250] eta: 0:29:17 lr: 0.000027 grad: 0.2531 (0.3519) loss: 0.7995 (0.8008) time: 0.2677 data: 0.0001 max mem: 26157 Train: [71] [ 300/6250] eta: 0:28:03 lr: 0.000027 grad: 0.1851 (0.3170) loss: 0.7972 (0.7980) time: 0.2676 data: 0.0001 max mem: 26157 Train: [71] [ 400/6250] eta: 0:27:12 lr: 0.000026 grad: 0.1976 (0.3080) loss: 0.7908 (0.7963) time: 0.2677 data: 0.0001 max mem: 26157 Train: [71] [ 500/6250] eta: 0:26:30 lr: 0.000026 grad: 0.1804 (0.2902) loss: 0.7882 (0.7954) time: 0.2677 data: 0.0001 max mem: 26157 Train: [71] [ 600/6250] eta: 0:25:54 lr: 0.000026 grad: 0.1759 (0.2934) loss: 0.7929 (0.7948) time: 0.2668 data: 0.0001 max mem: 26157 Train: [71] [ 700/6250] eta: 0:25:20 lr: 0.000026 grad: 0.1750 (0.2886) loss: 0.7901 (0.7938) time: 0.2676 data: 0.0001 max mem: 26157 Train: [71] [ 800/6250] eta: 0:24:49 lr: 0.000026 grad: 0.1833 (0.2787) loss: 0.7868 (0.7932) time: 0.2678 data: 0.0001 max mem: 26157 Train: [71] [ 900/6250] eta: 0:24:18 lr: 0.000026 grad: 0.1869 (0.2721) loss: 0.7886 (0.7925) time: 0.2674 data: 0.0001 max mem: 26157 Train: [71] [1000/6250] eta: 0:23:48 lr: 0.000026 grad: 0.2022 (0.2721) loss: 0.7847 (0.7918) time: 0.2678 data: 0.0001 max mem: 26157 Train: [71] [1100/6250] eta: 0:23:19 lr: 0.000026 grad: 0.1825 (0.2685) loss: 0.7767 (0.7910) time: 0.2679 data: 0.0002 max mem: 26157 Train: [71] [1200/6250] eta: 0:22:50 lr: 0.000026 grad: 0.1822 (0.2703) loss: 0.7869 (0.7904) time: 0.2676 data: 0.0002 max mem: 26157 Train: [71] [1300/6250] eta: 0:22:21 lr: 0.000026 grad: 0.1818 (0.2697) loss: 0.7799 (0.7898) time: 0.2668 data: 0.0001 max mem: 26157 Train: [71] [1400/6250] eta: 0:21:52 lr: 0.000026 grad: 0.1781 (0.2691) loss: 0.7793 (0.7892) time: 0.2686 data: 0.0002 max mem: 26157 Train: [71] [1500/6250] eta: 0:21:24 lr: 0.000026 grad: 0.1735 (0.2681) loss: 0.7854 (0.7889) time: 0.2668 data: 0.0001 max mem: 26157 Train: [71] [1600/6250] eta: 0:20:56 lr: 0.000026 grad: 0.1754 (0.2662) loss: 0.7806 (0.7886) time: 0.2669 data: 0.0001 max mem: 26157 Train: [71] [1700/6250] eta: 0:20:28 lr: 0.000026 grad: 0.1764 (0.2628) loss: 0.7823 (0.7882) time: 0.2672 data: 0.0002 max mem: 26157 Train: [71] [1800/6250] eta: 0:20:01 lr: 0.000026 grad: 0.1766 (0.2609) loss: 0.7856 (0.7878) time: 0.2667 data: 0.0001 max mem: 26157 Train: [71] [1900/6250] eta: 0:19:33 lr: 0.000026 grad: 0.1709 (0.2585) loss: 0.7852 (0.7876) time: 0.2663 data: 0.0001 max mem: 26157 Train: [71] [2000/6250] eta: 0:19:05 lr: 0.000026 grad: 0.1679 (0.2586) loss: 0.7749 (0.7874) time: 0.2668 data: 0.0001 max mem: 26157 Train: [71] [2100/6250] eta: 0:18:38 lr: 0.000026 grad: 0.1664 (0.2564) loss: 0.7829 (0.7871) time: 0.2673 data: 0.0001 max mem: 26157 Train: [71] [2200/6250] eta: 0:18:10 lr: 0.000026 grad: 0.1791 (0.2549) loss: 0.7726 (0.7868) time: 0.2672 data: 0.0001 max mem: 26157 Train: [71] [2300/6250] eta: 0:17:43 lr: 0.000026 grad: 0.1810 (0.2542) loss: 0.7721 (0.7865) time: 0.2687 data: 0.0001 max mem: 26157 Train: [71] [2400/6250] eta: 0:17:16 lr: 0.000026 grad: 0.1807 (0.2535) loss: 0.7732 (0.7861) time: 0.2676 data: 0.0002 max mem: 26157 Train: [71] [2500/6250] eta: 0:16:49 lr: 0.000026 grad: 0.1711 (0.2530) loss: 0.7813 (0.7859) time: 0.2679 data: 0.0002 max mem: 26157 Train: [71] [2600/6250] eta: 0:16:22 lr: 0.000026 grad: 0.1698 (0.2553) loss: 0.7862 (0.7856) time: 0.2681 data: 0.0002 max mem: 26157 Train: [71] [2700/6250] eta: 0:15:55 lr: 0.000026 grad: 0.1798 (0.2552) loss: 0.7804 (0.7853) time: 0.2677 data: 0.0001 max mem: 26157 Train: [71] [2800/6250] eta: 0:15:27 lr: 0.000026 grad: 0.1603 (0.2558) loss: 0.7856 (0.7852) time: 0.2668 data: 0.0001 max mem: 26157 Train: [71] [2900/6250] eta: 0:15:00 lr: 0.000026 grad: 0.1747 (0.2543) loss: 0.7821 (0.7850) time: 0.2663 data: 0.0001 max mem: 26157 Train: [71] [3000/6250] eta: 0:14:33 lr: 0.000026 grad: 0.1717 (0.2538) loss: 0.7796 (0.7848) time: 0.2664 data: 0.0001 max mem: 26157 Train: [71] [3100/6250] eta: 0:14:06 lr: 0.000026 grad: 0.1807 (0.2535) loss: 0.7811 (0.7847) time: 0.2715 data: 0.0001 max mem: 26157 Train: [71] [3200/6250] eta: 0:13:39 lr: 0.000026 grad: 0.1684 (0.2539) loss: 0.7825 (0.7846) time: 0.2672 data: 0.0001 max mem: 26157 Train: [71] [3300/6250] eta: 0:13:12 lr: 0.000026 grad: 0.1675 (0.2538) loss: 0.7799 (0.7844) time: 0.2668 data: 0.0001 max mem: 26157 Train: [71] [3400/6250] eta: 0:12:45 lr: 0.000026 grad: 0.1709 (0.2549) loss: 0.7825 (0.7844) time: 0.2673 data: 0.0001 max mem: 26157 Train: [71] [3500/6250] eta: 0:12:18 lr: 0.000026 grad: 0.1686 (0.2541) loss: 0.7828 (0.7844) time: 0.2669 data: 0.0001 max mem: 26157 Train: [71] [3600/6250] eta: 0:11:51 lr: 0.000026 grad: 0.1696 (0.2533) loss: 0.7863 (0.7845) time: 0.2666 data: 0.0001 max mem: 26157 Train: [71] [3700/6250] eta: 0:11:24 lr: 0.000026 grad: 0.1719 (0.2540) loss: 0.7842 (0.7845) time: 0.2668 data: 0.0001 max mem: 26157 Train: [71] [3800/6250] eta: 0:10:57 lr: 0.000026 grad: 0.1828 (0.2539) loss: 0.7849 (0.7845) time: 0.2666 data: 0.0001 max mem: 26157 Train: [71] [3900/6250] eta: 0:10:30 lr: 0.000026 grad: 0.1679 (0.2535) loss: 0.7840 (0.7846) time: 0.2671 data: 0.0002 max mem: 26157 Train: [71] [4000/6250] eta: 0:10:03 lr: 0.000026 grad: 0.1701 (0.2527) loss: 0.7842 (0.7847) time: 0.2668 data: 0.0001 max mem: 26157 Train: [71] [4100/6250] eta: 0:09:37 lr: 0.000026 grad: 0.2025 (0.2540) loss: 0.7829 (0.7847) time: 0.2664 data: 0.0001 max mem: 26157 Train: [71] [4200/6250] eta: 0:09:10 lr: 0.000025 grad: 0.1743 (0.2546) loss: 0.7918 (0.7847) time: 0.2672 data: 0.0001 max mem: 26157 Train: [71] [4300/6250] eta: 0:08:43 lr: 0.000025 grad: 0.1791 (0.2537) loss: 0.7840 (0.7847) time: 0.2673 data: 0.0001 max mem: 26157 Train: [71] [4400/6250] eta: 0:08:16 lr: 0.000025 grad: 0.1868 (0.2531) loss: 0.7878 (0.7847) time: 0.2665 data: 0.0001 max mem: 26157 Train: [71] [4500/6250] eta: 0:07:49 lr: 0.000025 grad: 0.1656 (0.2534) loss: 0.7952 (0.7848) time: 0.2666 data: 0.0001 max mem: 26157 Train: [71] [4600/6250] eta: 0:07:22 lr: 0.000025 grad: 0.1745 (0.2534) loss: 0.7781 (0.7848) time: 0.2669 data: 0.0001 max mem: 26157 Train: [71] [4700/6250] eta: 0:06:55 lr: 0.000025 grad: 0.1702 (0.2536) loss: 0.7840 (0.7848) time: 0.2674 data: 0.0001 max mem: 26157 Train: [71] [4800/6250] eta: 0:06:28 lr: 0.000025 grad: 0.1776 (0.2532) loss: 0.7816 (0.7848) time: 0.2677 data: 0.0001 max mem: 26157 Train: [71] [4900/6250] eta: 0:06:02 lr: 0.000025 grad: 0.1742 (0.2529) loss: 0.7846 (0.7847) time: 0.2669 data: 0.0002 max mem: 26157 Train: [71] [5000/6250] eta: 0:05:35 lr: 0.000025 grad: 0.1765 (0.2534) loss: 0.7793 (0.7847) time: 0.2668 data: 0.0001 max mem: 26157 Train: [71] [5100/6250] eta: 0:05:08 lr: 0.000025 grad: 0.2049 (0.2535) loss: 0.7879 (0.7847) time: 0.2672 data: 0.0001 max mem: 26157 Train: [71] [5200/6250] eta: 0:04:41 lr: 0.000025 grad: 0.1969 (0.2537) loss: 0.7850 (0.7847) time: 0.2674 data: 0.0001 max mem: 26157 Train: [71] [5300/6250] eta: 0:04:14 lr: 0.000025 grad: 0.1752 (0.2536) loss: 0.7802 (0.7846) time: 0.2671 data: 0.0001 max mem: 26157 Train: [71] [5400/6250] eta: 0:03:47 lr: 0.000025 grad: 0.1772 (0.2542) loss: 0.7858 (0.7845) time: 0.2670 data: 0.0001 max mem: 26157 Train: [71] [5500/6250] eta: 0:03:21 lr: 0.000025 grad: 0.2000 (0.2564) loss: 0.7813 (0.7844) time: 0.2674 data: 0.0001 max mem: 26157 Train: [71] [5600/6250] eta: 0:02:54 lr: 0.000025 grad: 0.1805 (0.2563) loss: 0.7794 (0.7843) time: 0.2670 data: 0.0001 max mem: 26157 Train: [71] [5700/6250] eta: 0:02:27 lr: 0.000025 grad: 0.1716 (0.2573) loss: 0.7847 (0.7843) time: 0.2670 data: 0.0001 max mem: 26157 Train: [71] [5800/6250] eta: 0:02:00 lr: 0.000025 grad: 0.1910 (0.2577) loss: 0.7772 (0.7842) time: 0.2673 data: 0.0001 max mem: 26157 Train: [71] [5900/6250] eta: 0:01:33 lr: 0.000025 grad: 0.1921 (0.2591) loss: 0.7782 (0.7841) time: 0.2661 data: 0.0001 max mem: 26157 Train: [71] [6000/6250] eta: 0:01:06 lr: 0.000025 grad: 0.1778 (0.2594) loss: 0.7713 (0.7839) time: 0.2673 data: 0.0002 max mem: 26157 Train: [71] [6100/6250] eta: 0:00:40 lr: 0.000025 grad: 0.1773 (0.2594) loss: 0.7773 (0.7838) time: 0.2677 data: 0.0001 max mem: 26157 Train: [71] [6200/6250] eta: 0:00:13 lr: 0.000025 grad: 0.1785 (0.2589) loss: 0.7753 (0.7838) time: 0.2674 data: 0.0001 max mem: 26157 Train: [71] [6249/6250] eta: 0:00:00 lr: 0.000025 grad: 0.1794 (0.2590) loss: 0.7902 (0.7838) time: 0.2666 data: 0.0001 max mem: 26157 Train: [71] Total time: 0:27:57 (0.2684 s / it) Averaged stats: lr: 0.000025 grad: 0.1794 (0.2590) loss: 0.7902 (0.7838) Eval (hcp-train-subset): [71] [ 0/62] eta: 0:03:52 loss: 0.8169 (0.8169) time: 3.7569 data: 3.6734 max mem: 26157 Eval (hcp-train-subset): [71] [61/62] eta: 0:00:00 loss: 0.8046 (0.8080) time: 0.0868 data: 0.0047 max mem: 26157 Eval (hcp-train-subset): [71] Total time: 0:00:10 (0.1663 s / it) Averaged stats (hcp-train-subset): loss: 0.8046 (0.8080) Making plots (hcp-train-subset): example=42 Eval (hcp-val): [71] [ 0/62] eta: 0:03:44 loss: 0.8197 (0.8197) time: 3.6171 data: 3.5336 max mem: 26157 Eval (hcp-val): [71] [61/62] eta: 0:00:00 loss: 0.8220 (0.8244) time: 0.0913 data: 0.0090 max mem: 26157 Eval (hcp-val): [71] Total time: 0:00:10 (0.1659 s / it) Averaged stats (hcp-val): loss: 0.8220 (0.8244) Making plots (hcp-val): example=33 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [72] [ 0/6250] eta: 5:58:08 lr: 0.000025 grad: 0.1538 (0.1538) loss: 0.8289 (0.8289) time: 3.4381 data: 3.0965 max mem: 26157 Train: [72] [ 100/6250] eta: 0:32:00 lr: 0.000025 grad: 0.2239 (0.3139) loss: 0.7890 (0.7995) time: 0.2684 data: 0.0002 max mem: 26157 Train: [72] [ 200/6250] eta: 0:29:13 lr: 0.000025 grad: 0.1941 (0.2794) loss: 0.7937 (0.7953) time: 0.2670 data: 0.0001 max mem: 26157 Train: [72] [ 300/6250] eta: 0:28:00 lr: 0.000025 grad: 0.1867 (0.2649) loss: 0.7926 (0.7920) time: 0.2686 data: 0.0002 max mem: 26157 Train: [72] [ 400/6250] eta: 0:27:10 lr: 0.000025 grad: 0.2074 (0.2596) loss: 0.7782 (0.7904) time: 0.2668 data: 0.0002 max mem: 26157 Train: [72] [ 500/6250] eta: 0:26:28 lr: 0.000025 grad: 0.1882 (0.2561) loss: 0.7862 (0.7897) time: 0.2675 data: 0.0001 max mem: 26157 Train: [72] [ 600/6250] eta: 0:25:53 lr: 0.000025 grad: 0.1771 (0.2502) loss: 0.7879 (0.7894) time: 0.2675 data: 0.0002 max mem: 26157 Train: [72] [ 700/6250] eta: 0:25:19 lr: 0.000025 grad: 0.1869 (0.2458) loss: 0.7838 (0.7891) time: 0.2677 data: 0.0001 max mem: 26157 Train: [72] [ 800/6250] eta: 0:24:47 lr: 0.000025 grad: 0.1658 (0.2454) loss: 0.7843 (0.7892) time: 0.2676 data: 0.0001 max mem: 26157 Train: [72] [ 900/6250] eta: 0:24:17 lr: 0.000025 grad: 0.1762 (0.2419) loss: 0.7894 (0.7891) time: 0.2676 data: 0.0001 max mem: 26157 Train: [72] [1000/6250] eta: 0:23:48 lr: 0.000025 grad: 0.1755 (0.2372) loss: 0.7901 (0.7892) time: 0.2685 data: 0.0002 max mem: 26157 Train: [72] [1100/6250] eta: 0:23:19 lr: 0.000025 grad: 0.1735 (0.2405) loss: 0.7900 (0.7888) time: 0.2672 data: 0.0002 max mem: 26157 Train: [72] [1200/6250] eta: 0:22:49 lr: 0.000025 grad: 0.1709 (0.2419) loss: 0.7842 (0.7884) time: 0.2666 data: 0.0001 max mem: 26157 Train: [72] [1300/6250] eta: 0:22:21 lr: 0.000025 grad: 0.1763 (0.2423) loss: 0.7877 (0.7879) time: 0.2676 data: 0.0001 max mem: 26157 Train: [72] [1400/6250] eta: 0:21:52 lr: 0.000025 grad: 0.1742 (0.2446) loss: 0.7798 (0.7875) time: 0.2680 data: 0.0002 max mem: 26157 Train: [72] [1500/6250] eta: 0:21:24 lr: 0.000025 grad: 0.1752 (0.2450) loss: 0.7793 (0.7872) time: 0.2669 data: 0.0002 max mem: 26157 Train: [72] [1600/6250] eta: 0:20:56 lr: 0.000025 grad: 0.1732 (0.2455) loss: 0.7812 (0.7868) time: 0.2674 data: 0.0001 max mem: 26157 Train: [72] [1700/6250] eta: 0:20:29 lr: 0.000024 grad: 0.1749 (0.2508) loss: 0.7748 (0.7865) time: 0.2692 data: 0.0002 max mem: 26157 Train: [72] [1800/6250] eta: 0:20:01 lr: 0.000024 grad: 0.1671 (0.2533) loss: 0.7778 (0.7863) time: 0.2673 data: 0.0001 max mem: 26157 Train: [72] [1900/6250] eta: 0:19:33 lr: 0.000024 grad: 0.1864 (0.2528) loss: 0.7829 (0.7860) time: 0.2669 data: 0.0002 max mem: 26157 Train: [72] [2000/6250] eta: 0:19:06 lr: 0.000024 grad: 0.1753 (0.2511) loss: 0.7879 (0.7859) time: 0.2674 data: 0.0002 max mem: 26157 Train: [72] [2100/6250] eta: 0:18:38 lr: 0.000024 grad: 0.1739 (0.2520) loss: 0.7780 (0.7857) time: 0.2667 data: 0.0002 max mem: 26157 Train: [72] [2200/6250] eta: 0:18:11 lr: 0.000024 grad: 0.1916 (0.2513) loss: 0.7839 (0.7854) time: 0.2678 data: 0.0001 max mem: 26157 Train: [72] [2300/6250] eta: 0:17:43 lr: 0.000024 grad: 0.1774 (0.2511) loss: 0.7777 (0.7851) time: 0.2668 data: 0.0001 max mem: 26157 Train: [72] [2400/6250] eta: 0:17:16 lr: 0.000024 grad: 0.1901 (0.2499) loss: 0.7738 (0.7849) time: 0.2678 data: 0.0002 max mem: 26157 Train: [72] [2500/6250] eta: 0:16:49 lr: 0.000024 grad: 0.1676 (0.2496) loss: 0.7714 (0.7846) time: 0.2673 data: 0.0002 max mem: 26157 Train: [72] [2600/6250] eta: 0:16:22 lr: 0.000024 grad: 0.1788 (0.2496) loss: 0.7796 (0.7843) time: 0.2669 data: 0.0001 max mem: 26157 Train: [72] [2700/6250] eta: 0:15:55 lr: 0.000024 grad: 0.1787 (0.2500) loss: 0.7741 (0.7840) time: 0.2665 data: 0.0001 max mem: 26157 Train: [72] [2800/6250] eta: 0:15:27 lr: 0.000024 grad: 0.1718 (0.2509) loss: 0.7864 (0.7836) time: 0.2671 data: 0.0001 max mem: 26157 Train: [72] [2900/6250] eta: 0:15:00 lr: 0.000024 grad: 0.1975 (0.2518) loss: 0.7730 (0.7832) time: 0.2676 data: 0.0001 max mem: 26157 Train: [72] [3000/6250] eta: 0:14:33 lr: 0.000024 grad: 0.1947 (0.2511) loss: 0.7733 (0.7830) time: 0.2685 data: 0.0001 max mem: 26157 Train: [72] [3100/6250] eta: 0:14:06 lr: 0.000024 grad: 0.1790 (0.2516) loss: 0.7723 (0.7827) time: 0.2671 data: 0.0002 max mem: 26157 Train: [72] [3200/6250] eta: 0:13:40 lr: 0.000024 grad: 0.2004 (0.2538) loss: 0.7783 (0.7825) time: 0.2678 data: 0.0001 max mem: 26157 Train: [72] [3300/6250] eta: 0:13:13 lr: 0.000024 grad: 0.1848 (0.2527) loss: 0.7733 (0.7823) time: 0.2675 data: 0.0002 max mem: 26157 Train: [72] [3400/6250] eta: 0:12:46 lr: 0.000024 grad: 0.1629 (0.2530) loss: 0.7837 (0.7822) time: 0.2671 data: 0.0001 max mem: 26157 Train: [72] [3500/6250] eta: 0:12:19 lr: 0.000024 grad: 0.1866 (0.2534) loss: 0.7774 (0.7821) time: 0.2670 data: 0.0001 max mem: 26157 Train: [72] [3600/6250] eta: 0:11:52 lr: 0.000024 grad: 0.1762 (0.2530) loss: 0.7836 (0.7820) time: 0.2672 data: 0.0001 max mem: 26157 Train: [72] [3700/6250] eta: 0:11:25 lr: 0.000024 grad: 0.1907 (0.2528) loss: 0.7768 (0.7819) time: 0.2684 data: 0.0002 max mem: 26157 Train: [72] [3800/6250] eta: 0:10:58 lr: 0.000024 grad: 0.1850 (0.2530) loss: 0.7783 (0.7818) time: 0.2689 data: 0.0002 max mem: 26157 Train: [72] [3900/6250] eta: 0:10:31 lr: 0.000024 grad: 0.1769 (0.2570) loss: 0.7782 (0.7817) time: 0.2674 data: 0.0001 max mem: 26157 Train: [72] [4000/6250] eta: 0:10:04 lr: 0.000024 grad: 0.1853 (0.2575) loss: 0.7803 (0.7815) time: 0.2672 data: 0.0001 max mem: 26157 Train: [72] [4100/6250] eta: 0:09:37 lr: 0.000024 grad: 0.1841 (0.2575) loss: 0.7757 (0.7814) time: 0.2682 data: 0.0002 max mem: 26157 Train: [72] [4200/6250] eta: 0:09:10 lr: 0.000024 grad: 0.1817 (0.2565) loss: 0.7750 (0.7812) time: 0.2677 data: 0.0001 max mem: 26157 Train: [72] [4300/6250] eta: 0:08:43 lr: 0.000024 grad: 0.2171 (0.2570) loss: 0.7769 (0.7811) time: 0.2685 data: 0.0002 max mem: 26157 Train: [72] [4400/6250] eta: 0:08:16 lr: 0.000024 grad: 0.1959 (0.2584) loss: 0.7800 (0.7810) time: 0.2681 data: 0.0001 max mem: 26157 Train: [72] [4500/6250] eta: 0:07:50 lr: 0.000024 grad: 0.1857 (0.2585) loss: 0.7815 (0.7809) time: 0.2685 data: 0.0002 max mem: 26157 Train: [72] [4600/6250] eta: 0:07:23 lr: 0.000024 grad: 0.1811 (0.2581) loss: 0.7729 (0.7808) time: 0.2689 data: 0.0002 max mem: 26157 Train: [72] [4700/6250] eta: 0:06:56 lr: 0.000024 grad: 0.2137 (0.2609) loss: 0.7853 (0.7807) time: 0.2675 data: 0.0002 max mem: 26157 Train: [72] [4800/6250] eta: 0:06:29 lr: 0.000024 grad: 0.1722 (0.2612) loss: 0.7734 (0.7806) time: 0.2665 data: 0.0001 max mem: 26157 Train: [72] [4900/6250] eta: 0:06:02 lr: 0.000024 grad: 0.1866 (0.2630) loss: 0.7779 (0.7805) time: 0.2668 data: 0.0001 max mem: 26157 Train: [72] [5000/6250] eta: 0:05:35 lr: 0.000024 grad: 0.1984 (0.2631) loss: 0.7771 (0.7805) time: 0.2673 data: 0.0002 max mem: 26157 Train: [72] [5100/6250] eta: 0:05:08 lr: 0.000024 grad: 0.1833 (0.2632) loss: 0.7663 (0.7804) time: 0.2674 data: 0.0001 max mem: 26157 Train: [72] [5200/6250] eta: 0:04:41 lr: 0.000024 grad: 0.1802 (0.2628) loss: 0.7762 (0.7804) time: 0.2706 data: 0.0002 max mem: 26157 Train: [72] [5300/6250] eta: 0:04:14 lr: 0.000024 grad: 0.1858 (0.2633) loss: 0.7869 (0.7803) time: 0.2667 data: 0.0001 max mem: 26157 Train: [72] [5400/6250] eta: 0:03:48 lr: 0.000024 grad: 0.1817 (0.2628) loss: 0.7811 (0.7803) time: 0.2668 data: 0.0001 max mem: 26157 Train: [72] [5500/6250] eta: 0:03:21 lr: 0.000023 grad: 0.1738 (0.2643) loss: 0.7818 (0.7803) time: 0.2667 data: 0.0001 max mem: 26157 Train: [72] [5600/6250] eta: 0:02:54 lr: 0.000023 grad: 0.1787 (0.2637) loss: 0.7742 (0.7803) time: 0.2679 data: 0.0001 max mem: 26157 Train: [72] [5700/6250] eta: 0:02:27 lr: 0.000023 grad: 0.1873 (0.2629) loss: 0.7777 (0.7803) time: 0.2677 data: 0.0001 max mem: 26157 Train: [72] [5800/6250] eta: 0:02:00 lr: 0.000023 grad: 0.1686 (0.2622) loss: 0.7847 (0.7803) time: 0.2685 data: 0.0002 max mem: 26157 Train: [72] [5900/6250] eta: 0:01:33 lr: 0.000023 grad: 0.1753 (0.2628) loss: 0.7794 (0.7803) time: 0.2671 data: 0.0002 max mem: 26157 Train: [72] [6000/6250] eta: 0:01:07 lr: 0.000023 grad: 0.1799 (0.2629) loss: 0.7807 (0.7803) time: 0.2675 data: 0.0001 max mem: 26157 Train: [72] [6100/6250] eta: 0:00:40 lr: 0.000023 grad: 0.1868 (0.2636) loss: 0.7724 (0.7803) time: 0.2668 data: 0.0002 max mem: 26157 Train: [72] [6200/6250] eta: 0:00:13 lr: 0.000023 grad: 0.1910 (0.2634) loss: 0.7728 (0.7804) time: 0.2677 data: 0.0001 max mem: 26157 Train: [72] [6249/6250] eta: 0:00:00 lr: 0.000023 grad: 0.1713 (0.2639) loss: 0.7775 (0.7804) time: 0.2680 data: 0.0002 max mem: 26157 Train: [72] Total time: 0:27:59 (0.2687 s / it) Averaged stats: lr: 0.000023 grad: 0.1713 (0.2639) loss: 0.7775 (0.7804) Eval (hcp-train-subset): [72] [ 0/62] eta: 0:02:58 loss: 0.8113 (0.8113) time: 2.8793 data: 2.7839 max mem: 26157 Eval (hcp-train-subset): [72] [61/62] eta: 0:00:00 loss: 0.8032 (0.8052) time: 0.1067 data: 0.0245 max mem: 26157 Eval (hcp-train-subset): [72] Total time: 0:00:10 (0.1694 s / it) Averaged stats (hcp-train-subset): loss: 0.8032 (0.8052) Making plots (hcp-train-subset): example=41 Eval (hcp-val): [72] [ 0/62] eta: 0:04:41 loss: 0.8223 (0.8223) time: 4.5332 data: 4.4502 max mem: 26157 Eval (hcp-val): [72] [61/62] eta: 0:00:00 loss: 0.8233 (0.8239) time: 0.0822 data: 0.0001 max mem: 26157 Eval (hcp-val): [72] Total time: 0:00:10 (0.1711 s / it) Averaged stats (hcp-val): loss: 0.8233 (0.8239) Making plots (hcp-val): example=38 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [73] [ 0/6250] eta: 8:16:09 lr: 0.000023 grad: 0.1818 (0.1818) loss: 0.8515 (0.8515) time: 4.7631 data: 4.4778 max mem: 26157 Train: [73] [ 100/6250] eta: 0:32:32 lr: 0.000023 grad: 0.2206 (0.2773) loss: 0.8116 (0.8107) time: 0.2675 data: 0.0002 max mem: 26157 Train: [73] [ 200/6250] eta: 0:29:30 lr: 0.000023 grad: 0.2015 (0.2771) loss: 0.7890 (0.8043) time: 0.2681 data: 0.0002 max mem: 26157 Train: [73] [ 300/6250] eta: 0:28:11 lr: 0.000023 grad: 0.2019 (0.2747) loss: 0.7881 (0.7996) time: 0.2668 data: 0.0002 max mem: 26157 Train: [73] [ 400/6250] eta: 0:27:17 lr: 0.000023 grad: 0.1994 (0.2661) loss: 0.7830 (0.7959) time: 0.2668 data: 0.0001 max mem: 26157 Train: [73] [ 500/6250] eta: 0:26:34 lr: 0.000023 grad: 0.1788 (0.2603) loss: 0.7907 (0.7939) time: 0.2665 data: 0.0001 max mem: 26157 Train: [73] [ 600/6250] eta: 0:25:57 lr: 0.000023 grad: 0.1923 (0.2694) loss: 0.7888 (0.7931) time: 0.2675 data: 0.0001 max mem: 26157 Train: [73] [ 700/6250] eta: 0:25:23 lr: 0.000023 grad: 0.1777 (0.2690) loss: 0.7855 (0.7923) time: 0.2672 data: 0.0002 max mem: 26157 Train: [73] [ 800/6250] eta: 0:24:50 lr: 0.000023 grad: 0.1735 (0.2685) loss: 0.7889 (0.7918) time: 0.2671 data: 0.0001 max mem: 26157 Train: [73] [ 900/6250] eta: 0:24:19 lr: 0.000023 grad: 0.1917 (0.2678) loss: 0.7872 (0.7913) time: 0.2666 data: 0.0001 max mem: 26157 Train: [73] [1000/6250] eta: 0:23:49 lr: 0.000023 grad: 0.1970 (0.2742) loss: 0.7885 (0.7910) time: 0.2664 data: 0.0001 max mem: 26157 Train: [73] [1100/6250] eta: 0:23:19 lr: 0.000023 grad: 0.1683 (0.2704) loss: 0.7838 (0.7907) time: 0.2690 data: 0.0002 max mem: 26157 Train: [73] [1200/6250] eta: 0:22:50 lr: 0.000023 grad: 0.1686 (0.2758) loss: 0.7877 (0.7906) time: 0.2670 data: 0.0002 max mem: 26157 Train: [73] [1300/6250] eta: 0:22:22 lr: 0.000023 grad: 0.1839 (0.2774) loss: 0.7815 (0.7903) time: 0.2674 data: 0.0001 max mem: 26157 Train: [73] [1400/6250] eta: 0:21:53 lr: 0.000023 grad: 0.1738 (0.2763) loss: 0.7848 (0.7898) time: 0.2671 data: 0.0001 max mem: 26157 Train: [73] [1500/6250] eta: 0:21:25 lr: 0.000023 grad: 0.1778 (0.2792) loss: 0.7914 (0.7896) time: 0.2668 data: 0.0001 max mem: 26157 Train: [73] [1600/6250] eta: 0:20:57 lr: 0.000023 grad: 0.1799 (0.2807) loss: 0.7907 (0.7894) time: 0.2665 data: 0.0001 max mem: 26157 Train: [73] [1700/6250] eta: 0:20:29 lr: 0.000023 grad: 0.1793 (0.2776) loss: 0.7826 (0.7893) time: 0.2670 data: 0.0001 max mem: 26157 Train: [73] [1800/6250] eta: 0:20:01 lr: 0.000023 grad: 0.1669 (0.2785) loss: 0.7862 (0.7891) time: 0.2674 data: 0.0001 max mem: 26157 Train: [73] [1900/6250] eta: 0:19:34 lr: 0.000023 grad: 0.1867 (0.2763) loss: 0.7858 (0.7890) time: 0.2677 data: 0.0002 max mem: 26157 Train: [73] [2000/6250] eta: 0:19:06 lr: 0.000023 grad: 0.1780 (0.2737) loss: 0.7850 (0.7887) time: 0.2671 data: 0.0001 max mem: 26157 Train: [73] [2100/6250] eta: 0:18:39 lr: 0.000023 grad: 0.1921 (0.2772) loss: 0.7842 (0.7884) time: 0.2675 data: 0.0002 max mem: 26157 Train: [73] [2200/6250] eta: 0:18:11 lr: 0.000023 grad: 0.1760 (0.2737) loss: 0.7855 (0.7883) time: 0.2675 data: 0.0001 max mem: 26157 Train: [73] [2300/6250] eta: 0:17:44 lr: 0.000023 grad: 0.1785 (0.2733) loss: 0.7828 (0.7881) time: 0.2676 data: 0.0001 max mem: 26157 Train: [73] [2400/6250] eta: 0:17:16 lr: 0.000023 grad: 0.1771 (0.2719) loss: 0.7829 (0.7880) time: 0.2673 data: 0.0002 max mem: 26157 Train: [73] [2500/6250] eta: 0:16:49 lr: 0.000023 grad: 0.1706 (0.2704) loss: 0.7828 (0.7878) time: 0.2669 data: 0.0002 max mem: 26157 Train: [73] [2600/6250] eta: 0:16:22 lr: 0.000023 grad: 0.1735 (0.2692) loss: 0.7875 (0.7876) time: 0.2678 data: 0.0001 max mem: 26157 Train: [73] [2700/6250] eta: 0:15:55 lr: 0.000023 grad: 0.1837 (0.2710) loss: 0.7804 (0.7875) time: 0.2668 data: 0.0002 max mem: 26157 Train: [73] [2800/6250] eta: 0:15:28 lr: 0.000023 grad: 0.1821 (0.2702) loss: 0.7816 (0.7872) time: 0.2668 data: 0.0001 max mem: 26157 Train: [73] [2900/6250] eta: 0:15:01 lr: 0.000023 grad: 0.1746 (0.2704) loss: 0.7803 (0.7869) time: 0.2705 data: 0.0002 max mem: 26157 Train: [73] [3000/6250] eta: 0:14:34 lr: 0.000023 grad: 0.1738 (0.2771) loss: 0.7820 (0.7868) time: 0.2665 data: 0.0002 max mem: 26157 Train: [73] [3100/6250] eta: 0:14:07 lr: 0.000023 grad: 0.1788 (0.2767) loss: 0.7859 (0.7865) time: 0.2691 data: 0.0002 max mem: 26157 Train: [73] [3200/6250] eta: 0:13:40 lr: 0.000022 grad: 0.1789 (0.2774) loss: 0.7805 (0.7863) time: 0.2668 data: 0.0002 max mem: 26157 Train: [73] [3300/6250] eta: 0:13:12 lr: 0.000022 grad: 0.1886 (0.2776) loss: 0.7811 (0.7861) time: 0.2674 data: 0.0002 max mem: 26157 Train: [73] [3400/6250] eta: 0:12:45 lr: 0.000022 grad: 0.1791 (0.2756) loss: 0.7736 (0.7860) time: 0.2671 data: 0.0002 max mem: 26157 Train: [73] [3500/6250] eta: 0:12:18 lr: 0.000022 grad: 0.1887 (0.2745) loss: 0.7780 (0.7858) time: 0.2670 data: 0.0001 max mem: 26157 Train: [73] [3600/6250] eta: 0:11:51 lr: 0.000022 grad: 0.1743 (0.2732) loss: 0.7853 (0.7857) time: 0.2666 data: 0.0001 max mem: 26157 Train: [73] [3700/6250] eta: 0:11:24 lr: 0.000022 grad: 0.1751 (0.2734) loss: 0.7862 (0.7856) time: 0.2668 data: 0.0002 max mem: 26157 Train: [73] [3800/6250] eta: 0:10:58 lr: 0.000022 grad: 0.1861 (0.2734) loss: 0.7781 (0.7854) time: 0.2669 data: 0.0001 max mem: 26157 Train: [73] [3900/6250] eta: 0:10:31 lr: 0.000022 grad: 0.1764 (0.2747) loss: 0.7836 (0.7853) time: 0.2667 data: 0.0001 max mem: 26157 Train: [73] [4000/6250] eta: 0:10:04 lr: 0.000022 grad: 0.1787 (0.2741) loss: 0.7756 (0.7853) time: 0.2662 data: 0.0001 max mem: 26157 Train: [73] [4100/6250] eta: 0:09:37 lr: 0.000022 grad: 0.1761 (0.2764) loss: 0.7873 (0.7853) time: 0.2659 data: 0.0001 max mem: 26157 Train: [73] [4200/6250] eta: 0:09:10 lr: 0.000022 grad: 0.1885 (0.2762) loss: 0.7805 (0.7852) time: 0.2666 data: 0.0001 max mem: 26157 Train: [73] [4300/6250] eta: 0:08:43 lr: 0.000022 grad: 0.1910 (0.2758) loss: 0.7823 (0.7851) time: 0.2660 data: 0.0001 max mem: 26157 Train: [73] [4400/6250] eta: 0:08:16 lr: 0.000022 grad: 0.1838 (0.2753) loss: 0.7823 (0.7851) time: 0.2670 data: 0.0002 max mem: 26157 Train: [73] [4500/6250] eta: 0:07:49 lr: 0.000022 grad: 0.1788 (0.2757) loss: 0.7713 (0.7850) time: 0.2675 data: 0.0001 max mem: 26157 Train: [73] [4600/6250] eta: 0:07:22 lr: 0.000022 grad: 0.1801 (0.2774) loss: 0.7834 (0.7850) time: 0.2669 data: 0.0001 max mem: 26157 Train: [73] [4700/6250] eta: 0:06:55 lr: 0.000022 grad: 0.1795 (0.2791) loss: 0.7718 (0.7850) time: 0.2674 data: 0.0001 max mem: 26157 Train: [73] [4800/6250] eta: 0:06:28 lr: 0.000022 grad: 0.1802 (0.2799) loss: 0.7805 (0.7849) time: 0.2670 data: 0.0001 max mem: 26157 Train: [73] [4900/6250] eta: 0:06:02 lr: 0.000022 grad: 0.1794 (0.2796) loss: 0.7834 (0.7847) time: 0.2671 data: 0.0002 max mem: 26157 Train: [73] [5000/6250] eta: 0:05:35 lr: 0.000022 grad: 0.1795 (0.2784) loss: 0.7756 (0.7846) time: 0.2680 data: 0.0001 max mem: 26157 Train: [73] [5100/6250] eta: 0:05:08 lr: 0.000022 grad: 0.1961 (0.2786) loss: 0.7757 (0.7844) time: 0.2663 data: 0.0001 max mem: 26157 Train: [73] [5200/6250] eta: 0:04:41 lr: 0.000022 grad: 0.1940 (0.2794) loss: 0.7669 (0.7842) time: 0.2676 data: 0.0002 max mem: 26157 Train: [73] [5300/6250] eta: 0:04:14 lr: 0.000022 grad: 0.1842 (0.2796) loss: 0.7741 (0.7840) time: 0.2678 data: 0.0002 max mem: 26157 Train: [73] [5400/6250] eta: 0:03:47 lr: 0.000022 grad: 0.1991 (0.2794) loss: 0.7743 (0.7837) time: 0.2663 data: 0.0001 max mem: 26157 Train: [73] [5500/6250] eta: 0:03:21 lr: 0.000022 grad: 0.1965 (0.2797) loss: 0.7713 (0.7836) time: 0.2664 data: 0.0001 max mem: 26157 Train: [73] [5600/6250] eta: 0:02:54 lr: 0.000022 grad: 0.1978 (0.2811) loss: 0.7671 (0.7834) time: 0.2672 data: 0.0002 max mem: 26157 Train: [73] [5700/6250] eta: 0:02:27 lr: 0.000022 grad: 0.1883 (0.2808) loss: 0.7604 (0.7832) time: 0.2671 data: 0.0001 max mem: 26157 Train: [73] [5800/6250] eta: 0:02:00 lr: 0.000022 grad: 0.1888 (0.2806) loss: 0.7691 (0.7830) time: 0.2673 data: 0.0002 max mem: 26157 Train: [73] [5900/6250] eta: 0:01:33 lr: 0.000022 grad: 0.1933 (0.2816) loss: 0.7718 (0.7828) time: 0.2668 data: 0.0001 max mem: 26157 Train: [73] [6000/6250] eta: 0:01:06 lr: 0.000022 grad: 0.1957 (0.2811) loss: 0.7761 (0.7826) time: 0.2671 data: 0.0001 max mem: 26157 Train: [73] [6100/6250] eta: 0:00:40 lr: 0.000022 grad: 0.2000 (0.2811) loss: 0.7667 (0.7825) time: 0.2674 data: 0.0002 max mem: 26157 Train: [73] [6200/6250] eta: 0:00:13 lr: 0.000022 grad: 0.2233 (0.2818) loss: 0.7616 (0.7823) time: 0.2668 data: 0.0001 max mem: 26157 Train: [73] [6249/6250] eta: 0:00:00 lr: 0.000022 grad: 0.2907 (0.2824) loss: 0.7661 (0.7822) time: 0.2672 data: 0.0001 max mem: 26157 Train: [73] Total time: 0:27:57 (0.2685 s / it) Averaged stats: lr: 0.000022 grad: 0.2907 (0.2824) loss: 0.7661 (0.7822) Eval (hcp-train-subset): [73] [ 0/62] eta: 0:04:27 loss: 0.8159 (0.8159) time: 4.3132 data: 4.2302 max mem: 26157 Eval (hcp-train-subset): [73] [61/62] eta: 0:00:00 loss: 0.8043 (0.8051) time: 0.1010 data: 0.0189 max mem: 26157 Eval (hcp-train-subset): [73] Total time: 0:00:10 (0.1715 s / it) Averaged stats (hcp-train-subset): loss: 0.8043 (0.8051) Making plots (hcp-train-subset): example=4 Eval (hcp-val): [73] [ 0/62] eta: 0:03:25 loss: 0.8192 (0.8192) time: 3.3115 data: 3.2027 max mem: 26157 Eval (hcp-val): [73] [61/62] eta: 0:00:00 loss: 0.8239 (0.8246) time: 0.0842 data: 0.0001 max mem: 26157 Eval (hcp-val): [73] Total time: 0:00:10 (0.1659 s / it) Averaged stats (hcp-val): loss: 0.8239 (0.8246) Making plots (hcp-val): example=51 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [74] [ 0/6250] eta: 7:01:28 lr: 0.000022 grad: 0.3829 (0.3829) loss: 0.8271 (0.8271) time: 4.0461 data: 3.7065 max mem: 26157 Train: [74] [ 100/6250] eta: 0:31:51 lr: 0.000022 grad: 0.2394 (0.3352) loss: 0.7811 (0.7897) time: 0.2670 data: 0.0001 max mem: 26157 Train: [74] [ 200/6250] eta: 0:29:09 lr: 0.000022 grad: 0.2055 (0.3072) loss: 0.7785 (0.7885) time: 0.2667 data: 0.0001 max mem: 26157 Train: [74] [ 300/6250] eta: 0:27:56 lr: 0.000022 grad: 0.2113 (0.3025) loss: 0.7840 (0.7873) time: 0.2667 data: 0.0002 max mem: 26157 Train: [74] [ 400/6250] eta: 0:27:08 lr: 0.000022 grad: 0.2036 (0.2904) loss: 0.7717 (0.7857) time: 0.2672 data: 0.0001 max mem: 26157 Train: [74] [ 500/6250] eta: 0:26:28 lr: 0.000022 grad: 0.1921 (0.3028) loss: 0.7797 (0.7856) time: 0.2682 data: 0.0001 max mem: 26157 Train: [74] [ 600/6250] eta: 0:25:52 lr: 0.000022 grad: 0.1741 (0.2845) loss: 0.7935 (0.7863) time: 0.2674 data: 0.0001 max mem: 26157 Train: [74] [ 700/6250] eta: 0:25:18 lr: 0.000022 grad: 0.1782 (0.2760) loss: 0.7895 (0.7869) time: 0.2666 data: 0.0002 max mem: 26157 Train: [74] [ 800/6250] eta: 0:24:47 lr: 0.000022 grad: 0.1771 (0.2763) loss: 0.7884 (0.7868) time: 0.2669 data: 0.0001 max mem: 26157 Train: [74] [ 900/6250] eta: 0:24:17 lr: 0.000021 grad: 0.1786 (0.2730) loss: 0.7806 (0.7864) time: 0.2679 data: 0.0001 max mem: 26157 Train: [74] [1000/6250] eta: 0:23:47 lr: 0.000021 grad: 0.1810 (0.2745) loss: 0.7758 (0.7858) time: 0.2667 data: 0.0001 max mem: 26157 Train: [74] [1100/6250] eta: 0:23:18 lr: 0.000021 grad: 0.1840 (0.2769) loss: 0.7803 (0.7854) time: 0.2669 data: 0.0002 max mem: 26157 Train: [74] [1200/6250] eta: 0:22:49 lr: 0.000021 grad: 0.1825 (0.2739) loss: 0.7774 (0.7850) time: 0.2670 data: 0.0001 max mem: 26157 Train: [74] [1300/6250] eta: 0:22:21 lr: 0.000021 grad: 0.1757 (0.2731) loss: 0.7728 (0.7846) time: 0.2672 data: 0.0001 max mem: 26157 Train: [74] [1400/6250] eta: 0:21:52 lr: 0.000021 grad: 0.1883 (0.2742) loss: 0.7774 (0.7841) time: 0.2676 data: 0.0001 max mem: 26157 Train: [74] [1500/6250] eta: 0:21:24 lr: 0.000021 grad: 0.1778 (0.2772) loss: 0.7837 (0.7839) time: 0.2667 data: 0.0001 max mem: 26157 Train: [74] [1600/6250] eta: 0:20:56 lr: 0.000021 grad: 0.1903 (0.2775) loss: 0.7797 (0.7836) time: 0.2668 data: 0.0001 max mem: 26157 Train: [74] [1700/6250] eta: 0:20:28 lr: 0.000021 grad: 0.1751 (0.2802) loss: 0.7865 (0.7833) time: 0.2664 data: 0.0002 max mem: 26157 Train: [74] [1800/6250] eta: 0:20:01 lr: 0.000021 grad: 0.1832 (0.2787) loss: 0.7664 (0.7826) time: 0.2669 data: 0.0001 max mem: 26157 Train: [74] [1900/6250] eta: 0:19:33 lr: 0.000021 grad: 0.1851 (0.2815) loss: 0.7790 (0.7824) time: 0.2687 data: 0.0002 max mem: 26157 Train: [74] [2000/6250] eta: 0:19:06 lr: 0.000021 grad: 0.1929 (0.2824) loss: 0.7724 (0.7820) time: 0.2695 data: 0.0002 max mem: 26157 Train: [74] [2100/6250] eta: 0:18:38 lr: 0.000021 grad: 0.1917 (0.2819) loss: 0.7780 (0.7819) time: 0.2671 data: 0.0001 max mem: 26157 Train: [74] [2200/6250] eta: 0:18:11 lr: 0.000021 grad: 0.1686 (0.2859) loss: 0.7867 (0.7818) time: 0.2669 data: 0.0001 max mem: 26157 Train: [74] [2300/6250] eta: 0:17:44 lr: 0.000021 grad: 0.1851 (0.2837) loss: 0.7828 (0.7818) time: 0.2689 data: 0.0002 max mem: 26157 Train: [74] [2400/6250] eta: 0:17:16 lr: 0.000021 grad: 0.1841 (0.2826) loss: 0.7832 (0.7819) time: 0.2668 data: 0.0001 max mem: 26157 Train: [74] [2500/6250] eta: 0:16:49 lr: 0.000021 grad: 0.1891 (0.2847) loss: 0.7825 (0.7818) time: 0.2666 data: 0.0001 max mem: 26157 Train: [74] [2600/6250] eta: 0:16:22 lr: 0.000021 grad: 0.1725 (0.2863) loss: 0.7786 (0.7818) time: 0.2670 data: 0.0001 max mem: 26157 Train: [74] [2700/6250] eta: 0:15:55 lr: 0.000021 grad: 0.1914 (0.2879) loss: 0.7853 (0.7818) time: 0.2673 data: 0.0001 max mem: 26157 Train: [74] [2800/6250] eta: 0:15:28 lr: 0.000021 grad: 0.1804 (0.2862) loss: 0.7797 (0.7818) time: 0.2668 data: 0.0001 max mem: 26157 Train: [74] [2900/6250] eta: 0:15:00 lr: 0.000021 grad: 0.1753 (0.2866) loss: 0.7832 (0.7818) time: 0.2678 data: 0.0002 max mem: 26157 Train: [74] [3000/6250] eta: 0:14:33 lr: 0.000021 grad: 0.1891 (0.2892) loss: 0.7833 (0.7818) time: 0.2677 data: 0.0001 max mem: 26157 Train: [74] [3100/6250] eta: 0:14:06 lr: 0.000021 grad: 0.1747 (0.2874) loss: 0.7830 (0.7818) time: 0.2671 data: 0.0001 max mem: 26157 Train: [74] [3200/6250] eta: 0:13:39 lr: 0.000021 grad: 0.2190 (0.2917) loss: 0.7752 (0.7817) time: 0.2668 data: 0.0001 max mem: 26157 Train: [74] [3300/6250] eta: 0:13:12 lr: 0.000021 grad: 0.1870 (0.2904) loss: 0.7809 (0.7817) time: 0.2667 data: 0.0001 max mem: 26157 Train: [74] [3400/6250] eta: 0:12:45 lr: 0.000021 grad: 0.1856 (0.2905) loss: 0.7879 (0.7817) time: 0.2680 data: 0.0002 max mem: 26157 Train: [74] [3500/6250] eta: 0:12:18 lr: 0.000021 grad: 0.1886 (0.2909) loss: 0.7816 (0.7817) time: 0.2678 data: 0.0001 max mem: 26157 Train: [74] [3600/6250] eta: 0:11:51 lr: 0.000021 grad: 0.1907 (0.2924) loss: 0.7838 (0.7817) time: 0.2674 data: 0.0001 max mem: 26157 Train: [74] [3700/6250] eta: 0:11:24 lr: 0.000021 grad: 0.1801 (0.2917) loss: 0.7811 (0.7816) time: 0.2677 data: 0.0002 max mem: 26157 Train: [74] [3800/6250] eta: 0:10:57 lr: 0.000021 grad: 0.1917 (0.2930) loss: 0.7821 (0.7816) time: 0.2668 data: 0.0001 max mem: 26157 Train: [74] [3900/6250] eta: 0:10:31 lr: 0.000021 grad: 0.1877 (0.2967) loss: 0.7795 (0.7816) time: 0.2681 data: 0.0002 max mem: 26157 Train: [74] [4000/6250] eta: 0:10:04 lr: 0.000021 grad: 0.1951 (0.2983) loss: 0.7724 (0.7815) time: 0.2672 data: 0.0001 max mem: 26157 Train: [74] [4100/6250] eta: 0:09:37 lr: 0.000021 grad: 0.1774 (0.2981) loss: 0.7751 (0.7814) time: 0.2673 data: 0.0001 max mem: 26157 Train: [74] [4200/6250] eta: 0:09:10 lr: 0.000021 grad: 0.1908 (0.2988) loss: 0.7773 (0.7814) time: 0.2668 data: 0.0001 max mem: 26157 Train: [74] [4300/6250] eta: 0:08:43 lr: 0.000021 grad: 0.1879 (0.2999) loss: 0.7788 (0.7813) time: 0.2677 data: 0.0001 max mem: 26157 Train: [74] [4400/6250] eta: 0:08:16 lr: 0.000021 grad: 0.1893 (0.3031) loss: 0.7787 (0.7812) time: 0.2679 data: 0.0002 max mem: 26157 Train: [74] [4500/6250] eta: 0:07:49 lr: 0.000021 grad: 0.1882 (0.3026) loss: 0.7778 (0.7811) time: 0.2675 data: 0.0001 max mem: 26157 Train: [74] [4600/6250] eta: 0:07:22 lr: 0.000021 grad: 0.1887 (0.3027) loss: 0.7805 (0.7811) time: 0.2670 data: 0.0001 max mem: 26157 Train: [74] [4700/6250] eta: 0:06:55 lr: 0.000021 grad: 0.1937 (0.3048) loss: 0.7812 (0.7810) time: 0.2676 data: 0.0001 max mem: 26157 Train: [74] [4800/6250] eta: 0:06:29 lr: 0.000021 grad: 0.1826 (0.3040) loss: 0.7753 (0.7810) time: 0.2666 data: 0.0001 max mem: 26157 Train: [74] [4900/6250] eta: 0:06:02 lr: 0.000020 grad: 0.1892 (0.3037) loss: 0.7765 (0.7809) time: 0.2668 data: 0.0001 max mem: 26157 Train: [74] [5000/6250] eta: 0:05:35 lr: 0.000020 grad: 0.1939 (0.3024) loss: 0.7731 (0.7809) time: 0.2675 data: 0.0001 max mem: 26157 Train: [74] [5100/6250] eta: 0:05:08 lr: 0.000020 grad: 0.1937 (0.3052) loss: 0.7816 (0.7808) time: 0.2677 data: 0.0002 max mem: 26157 Train: [74] [5200/6250] eta: 0:04:41 lr: 0.000020 grad: 0.1777 (0.3087) loss: 0.7827 (0.7809) time: 0.2671 data: 0.0001 max mem: 26157 Train: [74] [5300/6250] eta: 0:04:14 lr: 0.000020 grad: 0.1952 (0.3101) loss: 0.7889 (0.7809) time: 0.2676 data: 0.0001 max mem: 26157 Train: [74] [5400/6250] eta: 0:03:47 lr: 0.000020 grad: 0.1843 (0.3087) loss: 0.7799 (0.7809) time: 0.2662 data: 0.0001 max mem: 26157 Train: [74] [5500/6250] eta: 0:03:21 lr: 0.000020 grad: 0.1790 (0.3090) loss: 0.7834 (0.7809) time: 0.2671 data: 0.0002 max mem: 26157 Train: [74] [5600/6250] eta: 0:02:54 lr: 0.000020 grad: 0.1790 (0.3094) loss: 0.7769 (0.7808) time: 0.2677 data: 0.0002 max mem: 26157 Train: [74] [5700/6250] eta: 0:02:27 lr: 0.000020 grad: 0.1838 (0.3081) loss: 0.7757 (0.7809) time: 0.2677 data: 0.0002 max mem: 26157 Train: [74] [5800/6250] eta: 0:02:00 lr: 0.000020 grad: 0.1819 (0.3074) loss: 0.7818 (0.7809) time: 0.2677 data: 0.0001 max mem: 26157 Train: [74] [5900/6250] eta: 0:01:33 lr: 0.000020 grad: 0.2214 (0.3083) loss: 0.7824 (0.7809) time: 0.2672 data: 0.0001 max mem: 26157 Train: [74] [6000/6250] eta: 0:01:07 lr: 0.000020 grad: 0.1810 (0.3072) loss: 0.7778 (0.7809) time: 0.2664 data: 0.0001 max mem: 26157 Train: [74] [6100/6250] eta: 0:00:40 lr: 0.000020 grad: 0.1809 (0.3076) loss: 0.7861 (0.7808) time: 0.2680 data: 0.0001 max mem: 26157 Train: [74] [6200/6250] eta: 0:00:13 lr: 0.000020 grad: 0.1862 (0.3075) loss: 0.7782 (0.7808) time: 0.2668 data: 0.0001 max mem: 26157 Train: [74] [6249/6250] eta: 0:00:00 lr: 0.000020 grad: 0.1893 (0.3083) loss: 0.7839 (0.7808) time: 0.2674 data: 0.0001 max mem: 26157 Train: [74] Total time: 0:27:58 (0.2686 s / it) Averaged stats: lr: 0.000020 grad: 0.1893 (0.3083) loss: 0.7839 (0.7808) Eval (hcp-train-subset): [74] [ 0/62] eta: 0:04:11 loss: 0.8126 (0.8126) time: 4.0593 data: 3.9765 max mem: 26157 Eval (hcp-train-subset): [74] [61/62] eta: 0:00:00 loss: 0.8005 (0.8044) time: 0.0935 data: 0.0110 max mem: 26157 Eval (hcp-train-subset): [74] Total time: 0:00:10 (0.1652 s / it) Averaged stats (hcp-train-subset): loss: 0.8005 (0.8044) Making plots (hcp-train-subset): example=2 Eval (hcp-val): [74] [ 0/62] eta: 0:03:55 loss: 0.8210 (0.8210) time: 3.7935 data: 3.7118 max mem: 26157 Eval (hcp-val): [74] [61/62] eta: 0:00:00 loss: 0.8233 (0.8242) time: 0.0984 data: 0.0143 max mem: 26157 Eval (hcp-val): [74] Total time: 0:00:10 (0.1748 s / it) Averaged stats (hcp-val): loss: 0.8233 (0.8242) Making plots (hcp-val): example=50 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [75] [ 0/6250] eta: 8:57:39 lr: 0.000020 grad: 0.1332 (0.1332) loss: 0.8505 (0.8505) time: 5.1616 data: 4.8855 max mem: 26157 Train: [75] [ 100/6250] eta: 0:32:32 lr: 0.000020 grad: 0.2018 (0.2827) loss: 0.8025 (0.8009) time: 0.2677 data: 0.0002 max mem: 26157 Train: [75] [ 200/6250] eta: 0:29:31 lr: 0.000020 grad: 0.2051 (0.2626) loss: 0.7908 (0.7970) time: 0.2680 data: 0.0001 max mem: 26157 Train: [75] [ 300/6250] eta: 0:28:11 lr: 0.000020 grad: 0.1901 (0.2707) loss: 0.7926 (0.7950) time: 0.2666 data: 0.0002 max mem: 26157 Train: [75] [ 400/6250] eta: 0:27:18 lr: 0.000020 grad: 0.1896 (0.2804) loss: 0.8007 (0.7943) time: 0.2679 data: 0.0001 max mem: 26157 Train: [75] [ 500/6250] eta: 0:26:36 lr: 0.000020 grad: 0.1946 (0.2692) loss: 0.7982 (0.7936) time: 0.2677 data: 0.0002 max mem: 26157 Train: [75] [ 600/6250] eta: 0:25:58 lr: 0.000020 grad: 0.2144 (0.2705) loss: 0.7951 (0.7926) time: 0.2677 data: 0.0001 max mem: 26157 Train: [75] [ 700/6250] eta: 0:25:24 lr: 0.000020 grad: 0.1804 (0.2904) loss: 0.7861 (0.7920) time: 0.2670 data: 0.0001 max mem: 26157 Train: [75] [ 800/6250] eta: 0:24:51 lr: 0.000020 grad: 0.1875 (0.2885) loss: 0.7856 (0.7914) time: 0.2684 data: 0.0002 max mem: 26157 Train: [75] [ 900/6250] eta: 0:24:20 lr: 0.000020 grad: 0.2083 (0.2942) loss: 0.7876 (0.7909) time: 0.2683 data: 0.0002 max mem: 26157 Train: [75] [1000/6250] eta: 0:23:50 lr: 0.000020 grad: 0.1912 (0.2901) loss: 0.7823 (0.7904) time: 0.2656 data: 0.0001 max mem: 26157 Train: [75] [1100/6250] eta: 0:23:20 lr: 0.000020 grad: 0.1787 (0.2878) loss: 0.7931 (0.7898) time: 0.2668 data: 0.0001 max mem: 26157 Train: [75] [1200/6250] eta: 0:22:51 lr: 0.000020 grad: 0.1769 (0.2819) loss: 0.7817 (0.7892) time: 0.2665 data: 0.0001 max mem: 26157 Train: [75] [1300/6250] eta: 0:22:22 lr: 0.000020 grad: 0.1741 (0.2777) loss: 0.7786 (0.7886) time: 0.2663 data: 0.0001 max mem: 26157 Train: [75] [1400/6250] eta: 0:21:53 lr: 0.000020 grad: 0.1940 (0.2759) loss: 0.7766 (0.7882) time: 0.2670 data: 0.0002 max mem: 26157 Train: [75] [1500/6250] eta: 0:21:25 lr: 0.000020 grad: 0.1782 (0.2770) loss: 0.7830 (0.7879) time: 0.2667 data: 0.0002 max mem: 26157 Train: [75] [1600/6250] eta: 0:20:57 lr: 0.000020 grad: 0.1864 (0.2759) loss: 0.7818 (0.7875) time: 0.2677 data: 0.0001 max mem: 26157 Train: [75] [1700/6250] eta: 0:20:29 lr: 0.000020 grad: 0.1714 (0.2716) loss: 0.7962 (0.7873) time: 0.2662 data: 0.0001 max mem: 26157 Train: [75] [1800/6250] eta: 0:20:01 lr: 0.000020 grad: 0.1747 (0.2707) loss: 0.7807 (0.7871) time: 0.2674 data: 0.0001 max mem: 26157 Train: [75] [1900/6250] eta: 0:19:33 lr: 0.000020 grad: 0.1745 (0.2701) loss: 0.7862 (0.7869) time: 0.2665 data: 0.0001 max mem: 26157 Train: [75] [2000/6250] eta: 0:19:06 lr: 0.000020 grad: 0.1767 (0.2749) loss: 0.7809 (0.7866) time: 0.2678 data: 0.0002 max mem: 26157 Train: [75] [2100/6250] eta: 0:18:38 lr: 0.000020 grad: 0.1879 (0.2764) loss: 0.7846 (0.7865) time: 0.2667 data: 0.0001 max mem: 26157 Train: [75] [2200/6250] eta: 0:18:11 lr: 0.000020 grad: 0.1746 (0.2747) loss: 0.7851 (0.7863) time: 0.2676 data: 0.0001 max mem: 26157 Train: [75] [2300/6250] eta: 0:17:43 lr: 0.000020 grad: 0.1831 (0.2726) loss: 0.7819 (0.7860) time: 0.2660 data: 0.0001 max mem: 26157 Train: [75] [2400/6250] eta: 0:17:16 lr: 0.000020 grad: 0.1953 (0.2713) loss: 0.7769 (0.7857) time: 0.2677 data: 0.0002 max mem: 26157 Train: [75] [2500/6250] eta: 0:16:49 lr: 0.000020 grad: 0.1980 (0.2699) loss: 0.7844 (0.7854) time: 0.2670 data: 0.0002 max mem: 26157 Train: [75] [2600/6250] eta: 0:16:22 lr: 0.000020 grad: 0.1931 (0.2713) loss: 0.7765 (0.7851) time: 0.2673 data: 0.0001 max mem: 26157 Train: [75] [2700/6250] eta: 0:15:55 lr: 0.000020 grad: 0.1705 (0.2689) loss: 0.7802 (0.7849) time: 0.2678 data: 0.0001 max mem: 26157 Train: [75] [2800/6250] eta: 0:15:28 lr: 0.000019 grad: 0.1825 (0.2674) loss: 0.7755 (0.7846) time: 0.2675 data: 0.0001 max mem: 26157 Train: [75] [2900/6250] eta: 0:15:00 lr: 0.000019 grad: 0.1853 (0.2730) loss: 0.7747 (0.7843) time: 0.2680 data: 0.0001 max mem: 26157 Train: [75] [3000/6250] eta: 0:14:33 lr: 0.000019 grad: 0.1798 (0.2731) loss: 0.7706 (0.7841) time: 0.2666 data: 0.0001 max mem: 26157 Train: [75] [3100/6250] eta: 0:14:06 lr: 0.000019 grad: 0.1910 (0.2739) loss: 0.7807 (0.7839) time: 0.2669 data: 0.0001 max mem: 26157 Train: [75] [3200/6250] eta: 0:13:39 lr: 0.000019 grad: 0.1864 (0.2737) loss: 0.7742 (0.7837) time: 0.2679 data: 0.0001 max mem: 26157 Train: [75] [3300/6250] eta: 0:13:12 lr: 0.000019 grad: 0.1966 (0.2762) loss: 0.7758 (0.7835) time: 0.2667 data: 0.0001 max mem: 26157 Train: [75] [3400/6250] eta: 0:12:45 lr: 0.000019 grad: 0.2086 (0.2775) loss: 0.7743 (0.7834) time: 0.2668 data: 0.0001 max mem: 26157 Train: [75] [3500/6250] eta: 0:12:18 lr: 0.000019 grad: 0.1862 (0.2764) loss: 0.7674 (0.7832) time: 0.2673 data: 0.0001 max mem: 26157 Train: [75] [3600/6250] eta: 0:11:51 lr: 0.000019 grad: 0.1882 (0.2776) loss: 0.7799 (0.7830) time: 0.2669 data: 0.0001 max mem: 26157 Train: [75] [3700/6250] eta: 0:11:24 lr: 0.000019 grad: 0.1914 (0.2803) loss: 0.7823 (0.7828) time: 0.2675 data: 0.0001 max mem: 26157 Train: [75] [3800/6250] eta: 0:10:57 lr: 0.000019 grad: 0.1971 (0.2819) loss: 0.7734 (0.7827) time: 0.2669 data: 0.0002 max mem: 26157 Train: [75] [3900/6250] eta: 0:10:30 lr: 0.000019 grad: 0.1865 (0.2821) loss: 0.7828 (0.7825) time: 0.2666 data: 0.0002 max mem: 26157 Train: [75] [4000/6250] eta: 0:10:03 lr: 0.000019 grad: 0.1902 (0.2829) loss: 0.7787 (0.7824) time: 0.2675 data: 0.0001 max mem: 26157 Train: [75] [4100/6250] eta: 0:09:37 lr: 0.000019 grad: 0.1857 (0.2816) loss: 0.7819 (0.7824) time: 0.2667 data: 0.0002 max mem: 26157 Train: [75] [4200/6250] eta: 0:09:10 lr: 0.000019 grad: 0.1889 (0.2834) loss: 0.7776 (0.7824) time: 0.2667 data: 0.0001 max mem: 26157 Train: [75] [4300/6250] eta: 0:08:43 lr: 0.000019 grad: 0.1795 (0.2872) loss: 0.7857 (0.7824) time: 0.2668 data: 0.0001 max mem: 26157 Train: [75] [4400/6250] eta: 0:08:16 lr: 0.000019 grad: 0.1732 (0.2901) loss: 0.7931 (0.7824) time: 0.2682 data: 0.0002 max mem: 26157 Train: [75] [4500/6250] eta: 0:07:49 lr: 0.000019 grad: 0.1936 (0.2910) loss: 0.7867 (0.7824) time: 0.2673 data: 0.0001 max mem: 26157 Train: [75] [4600/6250] eta: 0:07:22 lr: 0.000019 grad: 0.1899 (0.2934) loss: 0.7819 (0.7824) time: 0.2687 data: 0.0002 max mem: 26157 Train: [75] [4700/6250] eta: 0:06:55 lr: 0.000019 grad: 0.1848 (0.2935) loss: 0.7755 (0.7824) time: 0.2666 data: 0.0001 max mem: 26157 Train: [75] [4800/6250] eta: 0:06:28 lr: 0.000019 grad: 0.1985 (0.2938) loss: 0.7834 (0.7824) time: 0.2675 data: 0.0001 max mem: 26157 Train: [75] [4900/6250] eta: 0:06:02 lr: 0.000019 grad: 0.1955 (0.2960) loss: 0.7879 (0.7824) time: 0.2669 data: 0.0001 max mem: 26157 Train: [75] [5000/6250] eta: 0:05:35 lr: 0.000019 grad: 0.1893 (0.2976) loss: 0.7857 (0.7824) time: 0.2682 data: 0.0002 max mem: 26157 Train: [75] [5100/6250] eta: 0:05:08 lr: 0.000019 grad: 0.1981 (0.2982) loss: 0.7806 (0.7823) time: 0.2675 data: 0.0001 max mem: 26157 Train: [75] [5200/6250] eta: 0:04:41 lr: 0.000019 grad: 0.2295 (0.2994) loss: 0.7793 (0.7822) time: 0.2695 data: 0.0002 max mem: 26157 Train: [75] [5300/6250] eta: 0:04:14 lr: 0.000019 grad: 0.1885 (0.2985) loss: 0.7835 (0.7822) time: 0.2675 data: 0.0001 max mem: 26157 Train: [75] [5400/6250] eta: 0:03:47 lr: 0.000019 grad: 0.1975 (0.2982) loss: 0.7796 (0.7821) time: 0.2665 data: 0.0001 max mem: 26157 Train: [75] [5500/6250] eta: 0:03:21 lr: 0.000019 grad: 0.1956 (0.2983) loss: 0.7821 (0.7820) time: 0.2667 data: 0.0001 max mem: 26157 Train: [75] [5600/6250] eta: 0:02:54 lr: 0.000019 grad: 0.1933 (0.2993) loss: 0.7789 (0.7819) time: 0.2667 data: 0.0001 max mem: 26157 Train: [75] [5700/6250] eta: 0:02:27 lr: 0.000019 grad: 0.1905 (0.2986) loss: 0.7750 (0.7818) time: 0.2667 data: 0.0001 max mem: 26157 Train: [75] [5800/6250] eta: 0:02:00 lr: 0.000019 grad: 0.1916 (0.3016) loss: 0.7780 (0.7817) time: 0.2668 data: 0.0001 max mem: 26157 Train: [75] [5900/6250] eta: 0:01:33 lr: 0.000019 grad: 0.1911 (0.3020) loss: 0.7793 (0.7817) time: 0.2671 data: 0.0001 max mem: 26157 Train: [75] [6000/6250] eta: 0:01:07 lr: 0.000019 grad: 0.3313 (0.3047) loss: 0.7763 (0.7816) time: 0.2661 data: 0.0001 max mem: 26157 Train: [75] [6100/6250] eta: 0:00:40 lr: 0.000019 grad: 0.1850 (0.3048) loss: 0.7766 (0.7816) time: 0.2667 data: 0.0002 max mem: 26157 Train: [75] [6200/6250] eta: 0:00:13 lr: 0.000019 grad: 0.1833 (0.3047) loss: 0.7756 (0.7815) time: 0.2672 data: 0.0002 max mem: 26157 Train: [75] [6249/6250] eta: 0:00:00 lr: 0.000019 grad: 0.1865 (0.3046) loss: 0.7758 (0.7814) time: 0.2664 data: 0.0001 max mem: 26157 Train: [75] Total time: 0:27:58 (0.2685 s / it) Averaged stats: lr: 0.000019 grad: 0.1865 (0.3046) loss: 0.7758 (0.7814) Eval (hcp-train-subset): [75] [ 0/62] eta: 0:03:08 loss: 0.8117 (0.8117) time: 3.0435 data: 2.9294 max mem: 26157 Eval (hcp-train-subset): [75] [61/62] eta: 0:00:00 loss: 0.7967 (0.8026) time: 0.1144 data: 0.0322 max mem: 26157 Eval (hcp-train-subset): [75] Total time: 0:00:11 (0.1880 s / it) Averaged stats (hcp-train-subset): loss: 0.7967 (0.8026) Making plots (hcp-train-subset): example=2 Eval (hcp-val): [75] [ 0/62] eta: 0:05:19 loss: 0.8192 (0.8192) time: 5.1483 data: 5.0657 max mem: 26157 Eval (hcp-val): [75] [61/62] eta: 0:00:00 loss: 0.8232 (0.8248) time: 0.0824 data: 0.0001 max mem: 26157 Eval (hcp-val): [75] Total time: 0:00:11 (0.1782 s / it) Averaged stats (hcp-val): loss: 0.8232 (0.8248) Making plots (hcp-val): example=6 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [76] [ 0/6250] eta: 6:40:21 lr: 0.000019 grad: 0.1271 (0.1271) loss: 0.8582 (0.8582) time: 3.8435 data: 3.5087 max mem: 26157 Train: [76] [ 100/6250] eta: 0:31:47 lr: 0.000019 grad: 0.2031 (0.2939) loss: 0.8036 (0.8008) time: 0.2663 data: 0.0001 max mem: 26157 Train: [76] [ 200/6250] eta: 0:29:06 lr: 0.000019 grad: 0.2027 (0.2785) loss: 0.7870 (0.7984) time: 0.2667 data: 0.0001 max mem: 26157 Train: [76] [ 300/6250] eta: 0:27:56 lr: 0.000019 grad: 0.1983 (0.3179) loss: 0.7846 (0.7950) time: 0.2680 data: 0.0001 max mem: 26157 Train: [76] [ 400/6250] eta: 0:27:06 lr: 0.000019 grad: 0.1833 (0.2980) loss: 0.7894 (0.7933) time: 0.2679 data: 0.0001 max mem: 26157 Train: [76] [ 500/6250] eta: 0:26:26 lr: 0.000019 grad: 0.1944 (0.2897) loss: 0.7829 (0.7916) time: 0.2681 data: 0.0002 max mem: 26157 Train: [76] [ 600/6250] eta: 0:25:51 lr: 0.000019 grad: 0.1919 (0.2817) loss: 0.7879 (0.7911) time: 0.2678 data: 0.0001 max mem: 26157 Train: [76] [ 700/6250] eta: 0:25:18 lr: 0.000019 grad: 0.1779 (0.2784) loss: 0.7986 (0.7906) time: 0.2671 data: 0.0001 max mem: 26157 Train: [76] [ 800/6250] eta: 0:24:46 lr: 0.000018 grad: 0.1717 (0.2778) loss: 0.7927 (0.7909) time: 0.2687 data: 0.0002 max mem: 26157 Train: [76] [ 900/6250] eta: 0:24:15 lr: 0.000018 grad: 0.1721 (0.2761) loss: 0.7877 (0.7906) time: 0.2662 data: 0.0001 max mem: 26157 Train: [76] [1000/6250] eta: 0:23:46 lr: 0.000018 grad: 0.1813 (0.2916) loss: 0.7900 (0.7904) time: 0.2677 data: 0.0001 max mem: 26157 Train: [76] [1100/6250] eta: 0:23:17 lr: 0.000018 grad: 0.1903 (0.2923) loss: 0.7819 (0.7903) time: 0.2673 data: 0.0002 max mem: 26157 Train: [76] [1200/6250] eta: 0:22:48 lr: 0.000018 grad: 0.1792 (0.2881) loss: 0.7862 (0.7901) time: 0.2673 data: 0.0001 max mem: 26157 Train: [76] [1300/6250] eta: 0:22:19 lr: 0.000018 grad: 0.1823 (0.2838) loss: 0.7800 (0.7897) time: 0.2667 data: 0.0001 max mem: 26157 Train: [76] [1400/6250] eta: 0:21:51 lr: 0.000018 grad: 0.1715 (0.2888) loss: 0.7940 (0.7893) time: 0.2667 data: 0.0002 max mem: 26157 Train: [76] [1500/6250] eta: 0:21:22 lr: 0.000018 grad: 0.1751 (0.2863) loss: 0.7822 (0.7888) time: 0.2670 data: 0.0001 max mem: 26157 Train: [76] [1600/6250] eta: 0:20:54 lr: 0.000018 grad: 0.1715 (0.2840) loss: 0.7828 (0.7885) time: 0.2665 data: 0.0001 max mem: 26157 Train: [76] [1700/6250] eta: 0:20:27 lr: 0.000018 grad: 0.1809 (0.2922) loss: 0.7894 (0.7883) time: 0.2664 data: 0.0002 max mem: 26157 Train: [76] [1800/6250] eta: 0:19:59 lr: 0.000018 grad: 0.1797 (0.2883) loss: 0.7881 (0.7880) time: 0.2672 data: 0.0002 max mem: 26157 Train: [76] [1900/6250] eta: 0:19:31 lr: 0.000018 grad: 0.1741 (0.2856) loss: 0.7811 (0.7879) time: 0.2663 data: 0.0002 max mem: 26157 Train: [76] [2000/6250] eta: 0:19:04 lr: 0.000018 grad: 0.1778 (0.2849) loss: 0.7791 (0.7877) time: 0.2673 data: 0.0001 max mem: 26157 Train: [76] [2100/6250] eta: 0:18:36 lr: 0.000018 grad: 0.1746 (0.2829) loss: 0.7859 (0.7875) time: 0.2665 data: 0.0001 max mem: 26157 Train: [76] [2200/6250] eta: 0:18:09 lr: 0.000018 grad: 0.1859 (0.2802) loss: 0.7881 (0.7874) time: 0.2667 data: 0.0001 max mem: 26157 Train: [76] [2300/6250] eta: 0:17:42 lr: 0.000018 grad: 0.1766 (0.2828) loss: 0.7821 (0.7874) time: 0.2671 data: 0.0001 max mem: 26157 Train: [76] [2400/6250] eta: 0:17:15 lr: 0.000018 grad: 0.1955 (0.2872) loss: 0.7768 (0.7873) time: 0.2662 data: 0.0001 max mem: 26157 Train: [76] [2500/6250] eta: 0:16:47 lr: 0.000018 grad: 0.1899 (0.2863) loss: 0.7854 (0.7871) time: 0.2674 data: 0.0001 max mem: 26157 Train: [76] [2600/6250] eta: 0:16:20 lr: 0.000018 grad: 0.1911 (0.2852) loss: 0.7884 (0.7871) time: 0.2665 data: 0.0001 max mem: 26157 Train: [76] [2700/6250] eta: 0:15:53 lr: 0.000018 grad: 0.1818 (0.2841) loss: 0.7806 (0.7869) time: 0.2674 data: 0.0001 max mem: 26157 Train: [76] [2800/6250] eta: 0:15:26 lr: 0.000018 grad: 0.1807 (0.2867) loss: 0.7797 (0.7867) time: 0.2680 data: 0.0001 max mem: 26157 Train: [76] [2900/6250] eta: 0:14:59 lr: 0.000018 grad: 0.1877 (0.2885) loss: 0.7816 (0.7865) time: 0.2668 data: 0.0001 max mem: 26157 Train: [76] [3000/6250] eta: 0:14:32 lr: 0.000018 grad: 0.1807 (0.2867) loss: 0.7771 (0.7863) time: 0.2674 data: 0.0001 max mem: 26157 Train: [76] [3100/6250] eta: 0:14:05 lr: 0.000018 grad: 0.1818 (0.2906) loss: 0.7745 (0.7859) time: 0.2669 data: 0.0001 max mem: 26157 Train: [76] [3200/6250] eta: 0:13:38 lr: 0.000018 grad: 0.1888 (0.2910) loss: 0.7852 (0.7857) time: 0.2665 data: 0.0002 max mem: 26157 Train: [76] [3300/6250] eta: 0:13:11 lr: 0.000018 grad: 0.1898 (0.2925) loss: 0.7863 (0.7855) time: 0.2671 data: 0.0001 max mem: 26157 Train: [76] [3400/6250] eta: 0:12:44 lr: 0.000018 grad: 0.2022 (0.2937) loss: 0.7711 (0.7852) time: 0.2681 data: 0.0001 max mem: 26157 Train: [76] [3500/6250] eta: 0:12:17 lr: 0.000018 grad: 0.2094 (0.2939) loss: 0.7832 (0.7850) time: 0.2674 data: 0.0001 max mem: 26157 Train: [76] [3600/6250] eta: 0:11:50 lr: 0.000018 grad: 0.2005 (0.2939) loss: 0.7798 (0.7849) time: 0.2662 data: 0.0001 max mem: 26157 Train: [76] [3700/6250] eta: 0:11:24 lr: 0.000018 grad: 0.1856 (0.2944) loss: 0.7854 (0.7848) time: 0.2671 data: 0.0002 max mem: 26157 Train: [76] [3800/6250] eta: 0:10:57 lr: 0.000018 grad: 0.1794 (0.2936) loss: 0.7808 (0.7847) time: 0.2670 data: 0.0001 max mem: 26157 Train: [76] [3900/6250] eta: 0:10:30 lr: 0.000018 grad: 0.1825 (0.2933) loss: 0.7788 (0.7846) time: 0.2666 data: 0.0001 max mem: 26157 Train: [76] [4000/6250] eta: 0:10:03 lr: 0.000018 grad: 0.1927 (0.2975) loss: 0.7849 (0.7845) time: 0.2678 data: 0.0002 max mem: 26157 Train: [76] [4100/6250] eta: 0:09:36 lr: 0.000018 grad: 0.2010 (0.2988) loss: 0.7876 (0.7845) time: 0.2678 data: 0.0001 max mem: 26157 Train: [76] [4200/6250] eta: 0:09:09 lr: 0.000018 grad: 0.1850 (0.2992) loss: 0.7847 (0.7844) time: 0.2662 data: 0.0001 max mem: 26157 Train: [76] [4300/6250] eta: 0:08:42 lr: 0.000018 grad: 0.1867 (0.3004) loss: 0.7822 (0.7843) time: 0.2667 data: 0.0001 max mem: 26157 Train: [76] [4400/6250] eta: 0:08:15 lr: 0.000018 grad: 0.1974 (0.3002) loss: 0.7768 (0.7843) time: 0.2677 data: 0.0002 max mem: 26157 Train: [76] [4500/6250] eta: 0:07:49 lr: 0.000018 grad: 0.2031 (0.3012) loss: 0.7725 (0.7842) time: 0.2672 data: 0.0001 max mem: 26157 Train: [76] [4600/6250] eta: 0:07:22 lr: 0.000018 grad: 0.1938 (0.3020) loss: 0.7745 (0.7841) time: 0.2670 data: 0.0001 max mem: 26157 Train: [76] [4700/6250] eta: 0:06:55 lr: 0.000018 grad: 0.1860 (0.3030) loss: 0.7740 (0.7839) time: 0.2673 data: 0.0001 max mem: 26157 Train: [76] [4800/6250] eta: 0:06:28 lr: 0.000018 grad: 0.1901 (0.3030) loss: 0.7798 (0.7838) time: 0.2661 data: 0.0001 max mem: 26157 Train: [76] [4900/6250] eta: 0:06:01 lr: 0.000018 grad: 0.1944 (0.3039) loss: 0.7788 (0.7837) time: 0.2670 data: 0.0001 max mem: 26157 Train: [76] [5000/6250] eta: 0:05:34 lr: 0.000018 grad: 0.1994 (0.3047) loss: 0.7829 (0.7837) time: 0.2676 data: 0.0001 max mem: 26157 Train: [76] [5100/6250] eta: 0:05:08 lr: 0.000017 grad: 0.1940 (0.3044) loss: 0.7812 (0.7836) time: 0.2669 data: 0.0001 max mem: 26157 Train: [76] [5200/6250] eta: 0:04:41 lr: 0.000017 grad: 0.1895 (0.3055) loss: 0.7683 (0.7835) time: 0.2671 data: 0.0002 max mem: 26157 Train: [76] [5300/6250] eta: 0:04:14 lr: 0.000017 grad: 0.2170 (0.3094) loss: 0.7843 (0.7834) time: 0.2664 data: 0.0001 max mem: 26157 Train: [76] [5400/6250] eta: 0:03:47 lr: 0.000017 grad: 0.2148 (0.3108) loss: 0.7800 (0.7833) time: 0.2660 data: 0.0001 max mem: 26157 Train: [76] [5500/6250] eta: 0:03:20 lr: 0.000017 grad: 0.2098 (0.3128) loss: 0.7768 (0.7832) time: 0.2671 data: 0.0001 max mem: 26157 Train: [76] [5600/6250] eta: 0:02:54 lr: 0.000017 grad: 0.1920 (0.3149) loss: 0.7793 (0.7831) time: 0.2668 data: 0.0001 max mem: 26157 Train: [76] [5700/6250] eta: 0:02:27 lr: 0.000017 grad: 0.2018 (0.3185) loss: 0.7762 (0.7831) time: 0.2666 data: 0.0001 max mem: 26157 Train: [76] [5800/6250] eta: 0:02:00 lr: 0.000017 grad: 0.2022 (0.3181) loss: 0.7748 (0.7829) time: 0.2670 data: 0.0001 max mem: 26157 Train: [76] [5900/6250] eta: 0:01:33 lr: 0.000017 grad: 0.1875 (0.3184) loss: 0.7762 (0.7828) time: 0.2670 data: 0.0001 max mem: 26157 Train: [76] [6000/6250] eta: 0:01:06 lr: 0.000017 grad: 0.1847 (0.3196) loss: 0.7810 (0.7828) time: 0.2665 data: 0.0001 max mem: 26157 Train: [76] [6100/6250] eta: 0:00:40 lr: 0.000017 grad: 0.2089 (0.3196) loss: 0.7765 (0.7827) time: 0.2671 data: 0.0001 max mem: 26157 Train: [76] [6200/6250] eta: 0:00:13 lr: 0.000017 grad: 0.1794 (0.3184) loss: 0.7841 (0.7827) time: 0.2669 data: 0.0001 max mem: 26157 Train: [76] [6249/6250] eta: 0:00:00 lr: 0.000017 grad: 0.1854 (0.3176) loss: 0.7766 (0.7827) time: 0.2667 data: 0.0001 max mem: 26157 Train: [76] Total time: 0:27:56 (0.2683 s / it) Averaged stats: lr: 0.000017 grad: 0.1854 (0.3176) loss: 0.7766 (0.7827) Eval (hcp-train-subset): [76] [ 0/62] eta: 0:04:31 loss: 0.8060 (0.8060) time: 4.3813 data: 4.2994 max mem: 26157 Eval (hcp-train-subset): [76] [61/62] eta: 0:00:00 loss: 0.7982 (0.8030) time: 0.1110 data: 0.0287 max mem: 26157 Eval (hcp-train-subset): [76] Total time: 0:00:10 (0.1667 s / it) Averaged stats (hcp-train-subset): loss: 0.7982 (0.8030) Making plots (hcp-train-subset): example=35 Eval (hcp-val): [76] [ 0/62] eta: 0:03:22 loss: 0.8241 (0.8241) time: 3.2631 data: 3.1684 max mem: 26157 Eval (hcp-val): [76] [61/62] eta: 0:00:00 loss: 0.8234 (0.8244) time: 0.0909 data: 0.0089 max mem: 26157 Eval (hcp-val): [76] Total time: 0:00:10 (0.1670 s / it) Averaged stats (hcp-val): loss: 0.8234 (0.8244) Making plots (hcp-val): example=2 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [77] [ 0/6250] eta: 7:54:06 lr: 0.000017 grad: 0.3645 (0.3645) loss: 0.7316 (0.7316) time: 4.5515 data: 4.2821 max mem: 26157 Train: [77] [ 100/6250] eta: 0:31:46 lr: 0.000017 grad: 0.2205 (0.3517) loss: 0.8028 (0.7952) time: 0.2669 data: 0.0001 max mem: 26157 Train: [77] [ 200/6250] eta: 0:29:06 lr: 0.000017 grad: 0.1992 (0.3891) loss: 0.7959 (0.7935) time: 0.2676 data: 0.0001 max mem: 26157 Train: [77] [ 300/6250] eta: 0:27:55 lr: 0.000017 grad: 0.1773 (0.3416) loss: 0.7935 (0.7928) time: 0.2672 data: 0.0001 max mem: 26157 Train: [77] [ 400/6250] eta: 0:27:06 lr: 0.000017 grad: 0.1894 (0.3337) loss: 0.7871 (0.7919) time: 0.2681 data: 0.0002 max mem: 26157 Train: [77] [ 500/6250] eta: 0:26:27 lr: 0.000017 grad: 0.1707 (0.3288) loss: 0.7894 (0.7914) time: 0.2686 data: 0.0002 max mem: 26157 Train: [77] [ 600/6250] eta: 0:25:51 lr: 0.000017 grad: 0.1864 (0.3249) loss: 0.7881 (0.7908) time: 0.2684 data: 0.0001 max mem: 26157 Train: [77] [ 700/6250] eta: 0:25:18 lr: 0.000017 grad: 0.1825 (0.3144) loss: 0.7839 (0.7902) time: 0.2675 data: 0.0001 max mem: 26157 Train: [77] [ 800/6250] eta: 0:24:46 lr: 0.000017 grad: 0.1854 (0.3110) loss: 0.7843 (0.7895) time: 0.2671 data: 0.0001 max mem: 26157 Train: [77] [ 900/6250] eta: 0:24:16 lr: 0.000017 grad: 0.1919 (0.3069) loss: 0.7825 (0.7888) time: 0.2703 data: 0.0002 max mem: 26157 Train: [77] [1000/6250] eta: 0:23:46 lr: 0.000017 grad: 0.1899 (0.2997) loss: 0.7781 (0.7881) time: 0.2668 data: 0.0002 max mem: 26157 Train: [77] [1100/6250] eta: 0:23:16 lr: 0.000017 grad: 0.1803 (0.2945) loss: 0.7803 (0.7875) time: 0.2666 data: 0.0002 max mem: 26157 Train: [77] [1200/6250] eta: 0:22:47 lr: 0.000017 grad: 0.1962 (0.2979) loss: 0.7819 (0.7868) time: 0.2666 data: 0.0001 max mem: 26157 Train: [77] [1300/6250] eta: 0:22:19 lr: 0.000017 grad: 0.1915 (0.2978) loss: 0.7748 (0.7862) time: 0.2680 data: 0.0002 max mem: 26157 Train: [77] [1400/6250] eta: 0:21:51 lr: 0.000017 grad: 0.2003 (0.3064) loss: 0.7847 (0.7858) time: 0.2672 data: 0.0001 max mem: 26157 Train: [77] [1500/6250] eta: 0:21:23 lr: 0.000017 grad: 0.1756 (0.3074) loss: 0.7804 (0.7854) time: 0.2677 data: 0.0002 max mem: 26157 Train: [77] [1600/6250] eta: 0:20:55 lr: 0.000017 grad: 0.1993 (0.3028) loss: 0.7798 (0.7851) time: 0.2686 data: 0.0002 max mem: 26157 Train: [77] [1700/6250] eta: 0:20:27 lr: 0.000017 grad: 0.1837 (0.3058) loss: 0.7679 (0.7847) time: 0.2673 data: 0.0001 max mem: 26157 Train: [77] [1800/6250] eta: 0:20:00 lr: 0.000017 grad: 0.1878 (0.3047) loss: 0.7817 (0.7844) time: 0.2669 data: 0.0001 max mem: 26157 Train: [77] [1900/6250] eta: 0:19:32 lr: 0.000017 grad: 0.1775 (0.3056) loss: 0.7834 (0.7844) time: 0.2668 data: 0.0001 max mem: 26157 Train: [77] [2000/6250] eta: 0:19:05 lr: 0.000017 grad: 0.1874 (0.3027) loss: 0.7838 (0.7843) time: 0.2671 data: 0.0001 max mem: 26157 Train: [77] [2100/6250] eta: 0:18:37 lr: 0.000017 grad: 0.1861 (0.3024) loss: 0.7802 (0.7841) time: 0.2681 data: 0.0001 max mem: 26157 Train: [77] [2200/6250] eta: 0:18:10 lr: 0.000017 grad: 0.1771 (0.3033) loss: 0.7828 (0.7840) time: 0.2674 data: 0.0001 max mem: 26157 Train: [77] [2300/6250] eta: 0:17:42 lr: 0.000017 grad: 0.1888 (0.3011) loss: 0.7843 (0.7840) time: 0.2664 data: 0.0002 max mem: 26157 Train: [77] [2400/6250] eta: 0:17:15 lr: 0.000017 grad: 0.1872 (0.3010) loss: 0.7835 (0.7840) time: 0.2674 data: 0.0001 max mem: 26157 Train: [77] [2500/6250] eta: 0:16:48 lr: 0.000017 grad: 0.1922 (0.3001) loss: 0.7864 (0.7839) time: 0.2673 data: 0.0001 max mem: 26157 Train: [77] [2600/6250] eta: 0:16:21 lr: 0.000017 grad: 0.1854 (0.3025) loss: 0.7827 (0.7839) time: 0.2677 data: 0.0001 max mem: 26157 Train: [77] [2700/6250] eta: 0:15:54 lr: 0.000017 grad: 0.1938 (0.3019) loss: 0.7792 (0.7838) time: 0.2670 data: 0.0001 max mem: 26157 Train: [77] [2800/6250] eta: 0:15:27 lr: 0.000017 grad: 0.1770 (0.3007) loss: 0.7878 (0.7838) time: 0.2680 data: 0.0001 max mem: 26157 Train: [77] [2900/6250] eta: 0:15:01 lr: 0.000017 grad: 0.1906 (0.3028) loss: 0.7825 (0.7838) time: 0.2668 data: 0.0001 max mem: 26157 Train: [77] [3000/6250] eta: 0:14:33 lr: 0.000017 grad: 0.1867 (0.3050) loss: 0.7811 (0.7837) time: 0.2677 data: 0.0001 max mem: 26157 Train: [77] [3100/6250] eta: 0:14:06 lr: 0.000017 grad: 0.1911 (0.3058) loss: 0.7788 (0.7837) time: 0.2664 data: 0.0001 max mem: 26157 Train: [77] [3200/6250] eta: 0:13:39 lr: 0.000017 grad: 0.1796 (0.3087) loss: 0.7840 (0.7836) time: 0.2670 data: 0.0001 max mem: 26157 Train: [77] [3300/6250] eta: 0:13:12 lr: 0.000016 grad: 0.1974 (0.3124) loss: 0.7794 (0.7836) time: 0.2663 data: 0.0001 max mem: 26157 Train: [77] [3400/6250] eta: 0:12:45 lr: 0.000016 grad: 0.1881 (0.3138) loss: 0.7782 (0.7835) time: 0.2672 data: 0.0001 max mem: 26157 Train: [77] [3500/6250] eta: 0:12:18 lr: 0.000016 grad: 0.1938 (0.3153) loss: 0.7791 (0.7835) time: 0.2663 data: 0.0001 max mem: 26157 Train: [77] [3600/6250] eta: 0:11:51 lr: 0.000016 grad: 0.1977 (0.3182) loss: 0.7874 (0.7835) time: 0.2676 data: 0.0001 max mem: 26157 Train: [77] [3700/6250] eta: 0:11:24 lr: 0.000016 grad: 0.2172 (0.3206) loss: 0.7779 (0.7834) time: 0.2668 data: 0.0001 max mem: 26157 Train: [77] [3800/6250] eta: 0:10:57 lr: 0.000016 grad: 0.1841 (0.3218) loss: 0.7806 (0.7833) time: 0.2676 data: 0.0001 max mem: 26157 Train: [77] [3900/6250] eta: 0:10:30 lr: 0.000016 grad: 0.1796 (0.3213) loss: 0.7818 (0.7832) time: 0.2674 data: 0.0001 max mem: 26157 Train: [77] [4000/6250] eta: 0:10:04 lr: 0.000016 grad: 0.2103 (0.3212) loss: 0.7770 (0.7831) time: 0.2671 data: 0.0001 max mem: 26157 Train: [77] [4100/6250] eta: 0:09:37 lr: 0.000016 grad: 0.1950 (0.3214) loss: 0.7802 (0.7829) time: 0.2669 data: 0.0001 max mem: 26157 Train: [77] [4200/6250] eta: 0:09:10 lr: 0.000016 grad: 0.1941 (0.3224) loss: 0.7775 (0.7828) time: 0.2667 data: 0.0001 max mem: 26157 Train: [77] [4300/6250] eta: 0:08:43 lr: 0.000016 grad: 0.2096 (0.3251) loss: 0.7779 (0.7826) time: 0.2669 data: 0.0001 max mem: 26157 Train: [77] [4400/6250] eta: 0:08:16 lr: 0.000016 grad: 0.1982 (0.3252) loss: 0.7769 (0.7825) time: 0.2668 data: 0.0002 max mem: 26157 Train: [77] [4500/6250] eta: 0:07:49 lr: 0.000016 grad: 0.2452 (0.3260) loss: 0.7755 (0.7823) time: 0.2664 data: 0.0001 max mem: 26157 Train: [77] [4600/6250] eta: 0:07:22 lr: 0.000016 grad: 0.2279 (0.3272) loss: 0.7724 (0.7821) time: 0.2671 data: 0.0001 max mem: 26157 Train: [77] [4700/6250] eta: 0:06:55 lr: 0.000016 grad: 0.1985 (0.3356) loss: 0.7835 (0.7820) time: 0.2670 data: 0.0001 max mem: 26157 Train: [77] [4800/6250] eta: 0:06:28 lr: 0.000016 grad: 0.1802 (0.3348) loss: 0.7824 (0.7819) time: 0.2668 data: 0.0002 max mem: 26157 Train: [77] [4900/6250] eta: 0:06:02 lr: 0.000016 grad: 0.2025 (0.3347) loss: 0.7817 (0.7818) time: 0.2671 data: 0.0001 max mem: 26157 Train: [77] [5000/6250] eta: 0:05:35 lr: 0.000016 grad: 0.1788 (0.3359) loss: 0.7793 (0.7817) time: 0.2672 data: 0.0001 max mem: 26157 Train: [77] [5100/6250] eta: 0:05:08 lr: 0.000016 grad: 0.2019 (0.3363) loss: 0.7683 (0.7816) time: 0.2683 data: 0.0001 max mem: 26157 Train: [77] [5200/6250] eta: 0:04:41 lr: 0.000016 grad: 0.1804 (0.3373) loss: 0.7906 (0.7816) time: 0.2668 data: 0.0002 max mem: 26157 Train: [77] [5300/6250] eta: 0:04:14 lr: 0.000016 grad: 0.1891 (0.3384) loss: 0.7804 (0.7816) time: 0.2665 data: 0.0001 max mem: 26157 Train: [77] [5400/6250] eta: 0:03:47 lr: 0.000016 grad: 0.2030 (0.3383) loss: 0.7765 (0.7816) time: 0.2665 data: 0.0001 max mem: 26157 Train: [77] [5500/6250] eta: 0:03:21 lr: 0.000016 grad: 0.1948 (0.3372) loss: 0.7820 (0.7815) time: 0.2671 data: 0.0001 max mem: 26157 Train: [77] [5600/6250] eta: 0:02:54 lr: 0.000016 grad: 0.1758 (0.3375) loss: 0.7808 (0.7815) time: 0.2665 data: 0.0001 max mem: 26157 Train: [77] [5700/6250] eta: 0:02:27 lr: 0.000016 grad: 0.1837 (0.3366) loss: 0.7760 (0.7814) time: 0.2671 data: 0.0001 max mem: 26157 Train: [77] [5800/6250] eta: 0:02:00 lr: 0.000016 grad: 0.1924 (0.3373) loss: 0.7780 (0.7813) time: 0.2667 data: 0.0001 max mem: 26157 Train: [77] [5900/6250] eta: 0:01:33 lr: 0.000016 grad: 0.1999 (0.3397) loss: 0.7824 (0.7813) time: 0.2665 data: 0.0001 max mem: 26157 Train: [77] [6000/6250] eta: 0:01:06 lr: 0.000016 grad: 0.1993 (0.3416) loss: 0.7827 (0.7812) time: 0.2671 data: 0.0001 max mem: 26157 Train: [77] [6100/6250] eta: 0:00:40 lr: 0.000016 grad: 0.2190 (0.3413) loss: 0.7757 (0.7811) time: 0.2682 data: 0.0001 max mem: 26157 Train: [77] [6200/6250] eta: 0:00:13 lr: 0.000016 grad: 0.2049 (0.3404) loss: 0.7707 (0.7811) time: 0.2680 data: 0.0001 max mem: 26157 Train: [77] [6249/6250] eta: 0:00:00 lr: 0.000016 grad: 0.2006 (0.3406) loss: 0.7651 (0.7810) time: 0.2669 data: 0.0001 max mem: 26157 Train: [77] Total time: 0:27:57 (0.2684 s / it) Averaged stats: lr: 0.000016 grad: 0.2006 (0.3406) loss: 0.7651 (0.7810) Eval (hcp-train-subset): [77] [ 0/62] eta: 0:05:07 loss: 0.8078 (0.8078) time: 4.9547 data: 4.8717 max mem: 26157 Eval (hcp-train-subset): [77] [61/62] eta: 0:00:00 loss: 0.7969 (0.8010) time: 0.1178 data: 0.0354 max mem: 26157 Eval (hcp-train-subset): [77] Total time: 0:00:11 (0.1847 s / it) Averaged stats (hcp-train-subset): loss: 0.7969 (0.8010) Making plots (hcp-train-subset): example=12 Eval (hcp-val): [77] [ 0/62] eta: 0:04:13 loss: 0.8202 (0.8202) time: 4.0936 data: 4.0105 max mem: 26157 Eval (hcp-val): [77] [61/62] eta: 0:00:00 loss: 0.8227 (0.8242) time: 0.0924 data: 0.0082 max mem: 26157 Eval (hcp-val): [77] Total time: 0:00:11 (0.1782 s / it) Averaged stats (hcp-val): loss: 0.8227 (0.8242) Making plots (hcp-val): example=43 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [78] [ 0/6250] eta: 8:44:04 lr: 0.000016 grad: 0.2683 (0.2683) loss: 0.8228 (0.8228) time: 5.0311 data: 4.7617 max mem: 26157 Train: [78] [ 100/6250] eta: 0:32:17 lr: 0.000016 grad: 0.2510 (0.5075) loss: 0.7911 (0.7919) time: 0.2676 data: 0.0001 max mem: 26157 Train: [78] [ 200/6250] eta: 0:29:21 lr: 0.000016 grad: 0.2578 (0.4132) loss: 0.7753 (0.7822) time: 0.2687 data: 0.0002 max mem: 26157 Train: [78] [ 300/6250] eta: 0:28:04 lr: 0.000016 grad: 0.2032 (0.3727) loss: 0.7885 (0.7807) time: 0.2673 data: 0.0001 max mem: 26157 Train: [78] [ 400/6250] eta: 0:27:12 lr: 0.000016 grad: 0.2109 (0.3697) loss: 0.7758 (0.7800) time: 0.2676 data: 0.0001 max mem: 26157 Train: [78] [ 500/6250] eta: 0:26:31 lr: 0.000016 grad: 0.2083 (0.3750) loss: 0.7954 (0.7810) time: 0.2683 data: 0.0001 max mem: 26157 Train: [78] [ 600/6250] eta: 0:25:54 lr: 0.000016 grad: 0.1913 (0.3577) loss: 0.7927 (0.7825) time: 0.2660 data: 0.0002 max mem: 26157 Train: [78] [ 700/6250] eta: 0:25:20 lr: 0.000016 grad: 0.1888 (0.3428) loss: 0.7886 (0.7839) time: 0.2688 data: 0.0002 max mem: 26157 Train: [78] [ 800/6250] eta: 0:24:48 lr: 0.000016 grad: 0.1866 (0.3486) loss: 0.7926 (0.7850) time: 0.2669 data: 0.0001 max mem: 26157 Train: [78] [ 900/6250] eta: 0:24:17 lr: 0.000016 grad: 0.1849 (0.3360) loss: 0.7888 (0.7855) time: 0.2669 data: 0.0002 max mem: 26157 Train: [78] [1000/6250] eta: 0:23:47 lr: 0.000016 grad: 0.1786 (0.3315) loss: 0.7879 (0.7860) time: 0.2677 data: 0.0001 max mem: 26157 Train: [78] [1100/6250] eta: 0:23:17 lr: 0.000016 grad: 0.1896 (0.3328) loss: 0.7857 (0.7861) time: 0.2667 data: 0.0002 max mem: 26157 Train: [78] [1200/6250] eta: 0:22:49 lr: 0.000016 grad: 0.1790 (0.3298) loss: 0.7771 (0.7862) time: 0.2669 data: 0.0002 max mem: 26157 Train: [78] [1300/6250] eta: 0:22:20 lr: 0.000016 grad: 0.1942 (0.3319) loss: 0.7790 (0.7861) time: 0.2678 data: 0.0002 max mem: 26157 Train: [78] [1400/6250] eta: 0:21:52 lr: 0.000016 grad: 0.1848 (0.3371) loss: 0.7786 (0.7858) time: 0.2679 data: 0.0002 max mem: 26157 Train: [78] [1500/6250] eta: 0:21:24 lr: 0.000015 grad: 0.1800 (0.3327) loss: 0.7800 (0.7855) time: 0.2667 data: 0.0001 max mem: 26157 Train: [78] [1600/6250] eta: 0:20:56 lr: 0.000015 grad: 0.1942 (0.3280) loss: 0.7789 (0.7852) time: 0.2670 data: 0.0001 max mem: 26157 Train: [78] [1700/6250] eta: 0:20:28 lr: 0.000015 grad: 0.1916 (0.3278) loss: 0.7833 (0.7849) time: 0.2667 data: 0.0001 max mem: 26157 Train: [78] [1800/6250] eta: 0:20:00 lr: 0.000015 grad: 0.1747 (0.3230) loss: 0.7869 (0.7849) time: 0.2667 data: 0.0002 max mem: 26157 Train: [78] [1900/6250] eta: 0:19:32 lr: 0.000015 grad: 0.1993 (0.3214) loss: 0.7853 (0.7848) time: 0.2667 data: 0.0001 max mem: 26157 Train: [78] [2000/6250] eta: 0:19:05 lr: 0.000015 grad: 0.1876 (0.3179) loss: 0.7816 (0.7849) time: 0.2664 data: 0.0001 max mem: 26157 Train: [78] [2100/6250] eta: 0:18:37 lr: 0.000015 grad: 0.1905 (0.3169) loss: 0.7822 (0.7847) time: 0.2671 data: 0.0001 max mem: 26157 Train: [78] [2200/6250] eta: 0:18:10 lr: 0.000015 grad: 0.1803 (0.3149) loss: 0.7917 (0.7848) time: 0.2666 data: 0.0001 max mem: 26157 Train: [78] [2300/6250] eta: 0:17:43 lr: 0.000015 grad: 0.1893 (0.3180) loss: 0.7897 (0.7847) time: 0.2661 data: 0.0001 max mem: 26157 Train: [78] [2400/6250] eta: 0:17:15 lr: 0.000015 grad: 0.1812 (0.3187) loss: 0.7881 (0.7847) time: 0.2667 data: 0.0001 max mem: 26157 Train: [78] [2500/6250] eta: 0:16:48 lr: 0.000015 grad: 0.1959 (0.3175) loss: 0.7811 (0.7846) time: 0.2668 data: 0.0001 max mem: 26157 Train: [78] [2600/6250] eta: 0:16:21 lr: 0.000015 grad: 0.1914 (0.3168) loss: 0.7774 (0.7845) time: 0.2673 data: 0.0001 max mem: 26157 Train: [78] [2700/6250] eta: 0:15:54 lr: 0.000015 grad: 0.1883 (0.3151) loss: 0.7848 (0.7844) time: 0.2662 data: 0.0001 max mem: 26157 Train: [78] [2800/6250] eta: 0:15:27 lr: 0.000015 grad: 0.1863 (0.3174) loss: 0.7919 (0.7845) time: 0.2676 data: 0.0002 max mem: 26157 Train: [78] [2900/6250] eta: 0:15:00 lr: 0.000015 grad: 0.1879 (0.3172) loss: 0.7785 (0.7844) time: 0.2663 data: 0.0001 max mem: 26157 Train: [78] [3000/6250] eta: 0:14:33 lr: 0.000015 grad: 0.1863 (0.3146) loss: 0.7854 (0.7843) time: 0.2665 data: 0.0001 max mem: 26157 Train: [78] [3100/6250] eta: 0:14:05 lr: 0.000015 grad: 0.1909 (0.3179) loss: 0.7852 (0.7842) time: 0.2664 data: 0.0002 max mem: 26157 Train: [78] [3200/6250] eta: 0:13:38 lr: 0.000015 grad: 0.1868 (0.3180) loss: 0.7883 (0.7841) time: 0.2668 data: 0.0001 max mem: 26157 Train: [78] [3300/6250] eta: 0:13:11 lr: 0.000015 grad: 0.1827 (0.3211) loss: 0.7827 (0.7840) time: 0.2668 data: 0.0001 max mem: 26157 Train: [78] [3400/6250] eta: 0:12:44 lr: 0.000015 grad: 0.1962 (0.3203) loss: 0.7794 (0.7839) time: 0.2676 data: 0.0001 max mem: 26157 Train: [78] [3500/6250] eta: 0:12:18 lr: 0.000015 grad: 0.2015 (0.3209) loss: 0.7902 (0.7837) time: 0.2667 data: 0.0001 max mem: 26157 Train: [78] [3600/6250] eta: 0:11:51 lr: 0.000015 grad: 0.1991 (0.3218) loss: 0.7814 (0.7836) time: 0.2664 data: 0.0001 max mem: 26157 Train: [78] [3700/6250] eta: 0:11:24 lr: 0.000015 grad: 0.2202 (0.3191) loss: 0.7808 (0.7835) time: 0.2658 data: 0.0001 max mem: 26157 Train: [78] [3800/6250] eta: 0:10:57 lr: 0.000015 grad: 0.1808 (0.3184) loss: 0.7785 (0.7834) time: 0.2660 data: 0.0001 max mem: 26157 Train: [78] [3900/6250] eta: 0:10:30 lr: 0.000015 grad: 0.1985 (0.3180) loss: 0.7857 (0.7833) time: 0.2670 data: 0.0001 max mem: 26157 Train: [78] [4000/6250] eta: 0:10:03 lr: 0.000015 grad: 0.1980 (0.3166) loss: 0.7843 (0.7831) time: 0.2672 data: 0.0001 max mem: 26157 Train: [78] [4100/6250] eta: 0:09:36 lr: 0.000015 grad: 0.1916 (0.3164) loss: 0.7848 (0.7831) time: 0.2670 data: 0.0002 max mem: 26157 Train: [78] [4200/6250] eta: 0:09:09 lr: 0.000015 grad: 0.1839 (0.3153) loss: 0.7802 (0.7829) time: 0.2662 data: 0.0001 max mem: 26157 Train: [78] [4300/6250] eta: 0:08:42 lr: 0.000015 grad: 0.1990 (0.3215) loss: 0.7747 (0.7829) time: 0.2662 data: 0.0001 max mem: 26157 Train: [78] [4400/6250] eta: 0:08:15 lr: 0.000015 grad: 0.1899 (0.3199) loss: 0.7788 (0.7828) time: 0.2668 data: 0.0001 max mem: 26157 Train: [78] [4500/6250] eta: 0:07:49 lr: 0.000015 grad: 0.1861 (0.3187) loss: 0.7774 (0.7827) time: 0.2667 data: 0.0002 max mem: 26157 Train: [78] [4600/6250] eta: 0:07:22 lr: 0.000015 grad: 0.2011 (0.3180) loss: 0.7694 (0.7826) time: 0.2663 data: 0.0001 max mem: 26157 Train: [78] [4700/6250] eta: 0:06:55 lr: 0.000015 grad: 0.1873 (0.3184) loss: 0.7795 (0.7825) time: 0.2671 data: 0.0001 max mem: 26157 Train: [78] [4800/6250] eta: 0:06:28 lr: 0.000015 grad: 0.2014 (0.3180) loss: 0.7778 (0.7824) time: 0.2661 data: 0.0001 max mem: 26157 Train: [78] [4900/6250] eta: 0:06:01 lr: 0.000015 grad: 0.1937 (0.3201) loss: 0.7890 (0.7823) time: 0.2668 data: 0.0001 max mem: 26157 Train: [78] [5000/6250] eta: 0:05:34 lr: 0.000015 grad: 0.1883 (0.3203) loss: 0.7804 (0.7822) time: 0.2665 data: 0.0001 max mem: 26157 Train: [78] [5100/6250] eta: 0:05:08 lr: 0.000015 grad: 0.1861 (0.3206) loss: 0.7832 (0.7822) time: 0.2677 data: 0.0001 max mem: 26157 Train: [78] [5200/6250] eta: 0:04:41 lr: 0.000015 grad: 0.1874 (0.3198) loss: 0.7813 (0.7822) time: 0.2677 data: 0.0001 max mem: 26157 Train: [78] [5300/6250] eta: 0:04:14 lr: 0.000015 grad: 0.1961 (0.3223) loss: 0.7852 (0.7821) time: 0.2679 data: 0.0001 max mem: 26157 Train: [78] [5400/6250] eta: 0:03:47 lr: 0.000015 grad: 0.1963 (0.3243) loss: 0.7772 (0.7820) time: 0.2662 data: 0.0001 max mem: 26157 Train: [78] [5500/6250] eta: 0:03:20 lr: 0.000015 grad: 0.1948 (0.3253) loss: 0.7799 (0.7820) time: 0.2657 data: 0.0001 max mem: 26157 Train: [78] [5600/6250] eta: 0:02:54 lr: 0.000015 grad: 0.1911 (0.3241) loss: 0.7831 (0.7819) time: 0.2671 data: 0.0001 max mem: 26157 Train: [78] [5700/6250] eta: 0:02:27 lr: 0.000015 grad: 0.2098 (0.3244) loss: 0.7790 (0.7818) time: 0.2660 data: 0.0001 max mem: 26157 Train: [78] [5800/6250] eta: 0:02:00 lr: 0.000015 grad: 0.1945 (0.3243) loss: 0.7712 (0.7817) time: 0.2661 data: 0.0001 max mem: 26157 Train: [78] [5900/6250] eta: 0:01:33 lr: 0.000015 grad: 0.1917 (0.3242) loss: 0.7782 (0.7816) time: 0.2667 data: 0.0002 max mem: 26157 Train: [78] [6000/6250] eta: 0:01:06 lr: 0.000015 grad: 0.1932 (0.3268) loss: 0.7744 (0.7815) time: 0.2668 data: 0.0001 max mem: 26157 Train: [78] [6100/6250] eta: 0:00:40 lr: 0.000015 grad: 0.1905 (0.3269) loss: 0.7791 (0.7814) time: 0.2660 data: 0.0001 max mem: 26157 Train: [78] [6200/6250] eta: 0:00:13 lr: 0.000014 grad: 0.1871 (0.3279) loss: 0.7830 (0.7813) time: 0.2670 data: 0.0001 max mem: 26157 Train: [78] [6249/6250] eta: 0:00:00 lr: 0.000014 grad: 0.1939 (0.3280) loss: 0.7732 (0.7812) time: 0.2663 data: 0.0001 max mem: 26157 Train: [78] Total time: 0:27:56 (0.2682 s / it) Averaged stats: lr: 0.000014 grad: 0.1939 (0.3280) loss: 0.7732 (0.7812) Eval (hcp-train-subset): [78] [ 0/62] eta: 0:04:37 loss: 0.8102 (0.8102) time: 4.4824 data: 4.3991 max mem: 26157 Eval (hcp-train-subset): [78] [61/62] eta: 0:00:00 loss: 0.7954 (0.7999) time: 0.0906 data: 0.0065 max mem: 26157 Eval (hcp-train-subset): [78] Total time: 0:00:10 (0.1672 s / it) Averaged stats (hcp-train-subset): loss: 0.7954 (0.7999) Making plots (hcp-train-subset): example=62 Eval (hcp-val): [78] [ 0/62] eta: 0:04:12 loss: 0.8197 (0.8197) time: 4.0784 data: 3.9952 max mem: 26157 Eval (hcp-val): [78] [61/62] eta: 0:00:00 loss: 0.8227 (0.8245) time: 0.0893 data: 0.0072 max mem: 26157 Eval (hcp-val): [78] Total time: 0:00:10 (0.1674 s / it) Averaged stats (hcp-val): loss: 0.8227 (0.8245) Making plots (hcp-val): example=29 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [79] [ 0/6250] eta: 6:14:29 lr: 0.000014 grad: 0.1601 (0.1601) loss: 0.8393 (0.8393) time: 3.5951 data: 3.2735 max mem: 26157 Train: [79] [ 100/6250] eta: 0:32:02 lr: 0.000014 grad: 0.2150 (0.3866) loss: 0.7990 (0.7917) time: 0.2684 data: 0.0002 max mem: 26157 Train: [79] [ 200/6250] eta: 0:29:17 lr: 0.000014 grad: 0.2119 (0.3676) loss: 0.7931 (0.7890) time: 0.2672 data: 0.0002 max mem: 26157 Train: [79] [ 300/6250] eta: 0:28:03 lr: 0.000014 grad: 0.1789 (0.3233) loss: 0.7914 (0.7872) time: 0.2679 data: 0.0001 max mem: 26157 Train: [79] [ 400/6250] eta: 0:27:13 lr: 0.000014 grad: 0.1881 (0.3375) loss: 0.7780 (0.7869) time: 0.2696 data: 0.0002 max mem: 26157 Train: [79] [ 500/6250] eta: 0:26:32 lr: 0.000014 grad: 0.2059 (0.3580) loss: 0.7774 (0.7863) time: 0.2677 data: 0.0001 max mem: 26157 Train: [79] [ 600/6250] eta: 0:25:55 lr: 0.000014 grad: 0.1894 (0.3460) loss: 0.7799 (0.7854) time: 0.2669 data: 0.0001 max mem: 26157 Train: [79] [ 700/6250] eta: 0:25:21 lr: 0.000014 grad: 0.1890 (0.3329) loss: 0.7828 (0.7847) time: 0.2666 data: 0.0001 max mem: 26157 Train: [79] [ 800/6250] eta: 0:24:49 lr: 0.000014 grad: 0.2060 (0.3381) loss: 0.7721 (0.7837) time: 0.2670 data: 0.0001 max mem: 26157 Train: [79] [ 900/6250] eta: 0:24:18 lr: 0.000014 grad: 0.2025 (0.3343) loss: 0.7742 (0.7828) time: 0.2676 data: 0.0002 max mem: 26157 Train: [79] [1000/6250] eta: 0:23:48 lr: 0.000014 grad: 0.1932 (0.3296) loss: 0.7792 (0.7820) time: 0.2669 data: 0.0001 max mem: 26157 Train: [79] [1100/6250] eta: 0:23:18 lr: 0.000014 grad: 0.1873 (0.3239) loss: 0.7732 (0.7815) time: 0.2674 data: 0.0001 max mem: 26157 Train: [79] [1200/6250] eta: 0:22:49 lr: 0.000014 grad: 0.1972 (0.3243) loss: 0.7765 (0.7810) time: 0.2660 data: 0.0001 max mem: 26157 Train: [79] [1300/6250] eta: 0:22:20 lr: 0.000014 grad: 0.1891 (0.3235) loss: 0.7782 (0.7805) time: 0.2673 data: 0.0002 max mem: 26157 Train: [79] [1400/6250] eta: 0:21:52 lr: 0.000014 grad: 0.1918 (0.3228) loss: 0.7736 (0.7801) time: 0.2669 data: 0.0001 max mem: 26157 Train: [79] [1500/6250] eta: 0:21:24 lr: 0.000014 grad: 0.1951 (0.3222) loss: 0.7786 (0.7800) time: 0.2668 data: 0.0001 max mem: 26157 Train: [79] [1600/6250] eta: 0:20:56 lr: 0.000014 grad: 0.1874 (0.3230) loss: 0.7734 (0.7799) time: 0.2672 data: 0.0002 max mem: 26157 Train: [79] [1700/6250] eta: 0:20:28 lr: 0.000014 grad: 0.2157 (0.3295) loss: 0.7812 (0.7799) time: 0.2666 data: 0.0001 max mem: 26157 Train: [79] [1800/6250] eta: 0:20:00 lr: 0.000014 grad: 0.2053 (0.3271) loss: 0.7793 (0.7798) time: 0.2661 data: 0.0001 max mem: 26157 Train: [79] [1900/6250] eta: 0:19:32 lr: 0.000014 grad: 0.1794 (0.3255) loss: 0.7805 (0.7798) time: 0.2671 data: 0.0001 max mem: 26157 Train: [79] [2000/6250] eta: 0:19:05 lr: 0.000014 grad: 0.1887 (0.3317) loss: 0.7811 (0.7796) time: 0.2662 data: 0.0001 max mem: 26157 Train: [79] [2100/6250] eta: 0:18:37 lr: 0.000014 grad: 0.2080 (0.3295) loss: 0.7803 (0.7796) time: 0.2668 data: 0.0001 max mem: 26157 Train: [79] [2200/6250] eta: 0:18:10 lr: 0.000014 grad: 0.1863 (0.3299) loss: 0.7876 (0.7798) time: 0.2675 data: 0.0001 max mem: 26157 Train: [79] [2300/6250] eta: 0:17:43 lr: 0.000014 grad: 0.1790 (0.3307) loss: 0.7805 (0.7799) time: 0.2674 data: 0.0001 max mem: 26157 Train: [79] [2400/6250] eta: 0:17:16 lr: 0.000014 grad: 0.1866 (0.3314) loss: 0.7791 (0.7799) time: 0.2672 data: 0.0001 max mem: 26157 Train: [79] [2500/6250] eta: 0:16:48 lr: 0.000014 grad: 0.1968 (0.3328) loss: 0.7720 (0.7799) time: 0.2684 data: 0.0002 max mem: 26157 Train: [79] [2600/6250] eta: 0:16:21 lr: 0.000014 grad: 0.1965 (0.3312) loss: 0.7709 (0.7799) time: 0.2673 data: 0.0001 max mem: 26157 Train: [79] [2700/6250] eta: 0:15:54 lr: 0.000014 grad: 0.1811 (0.3335) loss: 0.7798 (0.7799) time: 0.2685 data: 0.0002 max mem: 26157 Train: [79] [2800/6250] eta: 0:15:27 lr: 0.000014 grad: 0.1990 (0.3373) loss: 0.7826 (0.7799) time: 0.2667 data: 0.0001 max mem: 26157 Train: [79] [2900/6250] eta: 0:15:00 lr: 0.000014 grad: 0.1833 (0.3385) loss: 0.7857 (0.7799) time: 0.2684 data: 0.0001 max mem: 26157 Train: [79] [3000/6250] eta: 0:14:33 lr: 0.000014 grad: 0.2078 (0.3397) loss: 0.7833 (0.7798) time: 0.2675 data: 0.0001 max mem: 26157 Train: [79] [3100/6250] eta: 0:14:06 lr: 0.000014 grad: 0.2112 (0.3396) loss: 0.7857 (0.7799) time: 0.2668 data: 0.0001 max mem: 26157 Train: [79] [3200/6250] eta: 0:13:39 lr: 0.000014 grad: 0.1809 (0.3375) loss: 0.7766 (0.7800) time: 0.2671 data: 0.0001 max mem: 26157 Train: [79] [3300/6250] eta: 0:13:12 lr: 0.000014 grad: 0.1940 (0.3364) loss: 0.7770 (0.7800) time: 0.2674 data: 0.0001 max mem: 26157 Train: [79] [3400/6250] eta: 0:12:45 lr: 0.000014 grad: 0.2049 (0.3367) loss: 0.7800 (0.7800) time: 0.2676 data: 0.0001 max mem: 26157 Train: [79] [3500/6250] eta: 0:12:18 lr: 0.000014 grad: 0.1859 (0.3342) loss: 0.7821 (0.7799) time: 0.2678 data: 0.0001 max mem: 26157 Train: [79] [3600/6250] eta: 0:11:51 lr: 0.000014 grad: 0.1917 (0.3331) loss: 0.7790 (0.7799) time: 0.2665 data: 0.0001 max mem: 26157 Train: [79] [3700/6250] eta: 0:11:24 lr: 0.000014 grad: 0.1874 (0.3346) loss: 0.7873 (0.7799) time: 0.2673 data: 0.0001 max mem: 26157 Train: [79] [3800/6250] eta: 0:10:57 lr: 0.000014 grad: 0.1917 (0.3317) loss: 0.7704 (0.7799) time: 0.2670 data: 0.0001 max mem: 26157 Train: [79] [3900/6250] eta: 0:10:30 lr: 0.000014 grad: 0.1886 (0.3306) loss: 0.7833 (0.7799) time: 0.2677 data: 0.0001 max mem: 26157 Train: [79] [4000/6250] eta: 0:10:03 lr: 0.000014 grad: 0.1914 (0.3296) loss: 0.7822 (0.7798) time: 0.2665 data: 0.0001 max mem: 26157 Train: [79] [4100/6250] eta: 0:09:36 lr: 0.000014 grad: 0.2058 (0.3334) loss: 0.7729 (0.7797) time: 0.2664 data: 0.0001 max mem: 26157 Train: [79] [4200/6250] eta: 0:09:09 lr: 0.000014 grad: 0.2019 (0.3326) loss: 0.7791 (0.7797) time: 0.2664 data: 0.0001 max mem: 26157 Train: [79] [4300/6250] eta: 0:08:43 lr: 0.000014 grad: 0.1895 (0.3333) loss: 0.7862 (0.7796) time: 0.2675 data: 0.0002 max mem: 26157 Train: [79] [4400/6250] eta: 0:08:16 lr: 0.000014 grad: 0.1838 (0.3311) loss: 0.7812 (0.7795) time: 0.2662 data: 0.0001 max mem: 26157 Train: [79] [4500/6250] eta: 0:07:49 lr: 0.000014 grad: 0.1970 (0.3305) loss: 0.7741 (0.7796) time: 0.2667 data: 0.0001 max mem: 26157 Train: [79] [4600/6250] eta: 0:07:22 lr: 0.000014 grad: 0.1823 (0.3306) loss: 0.7802 (0.7795) time: 0.2669 data: 0.0001 max mem: 26157 Train: [79] [4700/6250] eta: 0:06:55 lr: 0.000013 grad: 0.1860 (0.3310) loss: 0.7779 (0.7795) time: 0.2670 data: 0.0001 max mem: 26157 Train: [79] [4800/6250] eta: 0:06:28 lr: 0.000013 grad: 0.1922 (0.3305) loss: 0.7791 (0.7795) time: 0.2673 data: 0.0001 max mem: 26157 Train: [79] [4900/6250] eta: 0:06:01 lr: 0.000013 grad: 0.1969 (0.3314) loss: 0.7795 (0.7795) time: 0.2671 data: 0.0002 max mem: 26157 Train: [79] [5000/6250] eta: 0:05:35 lr: 0.000013 grad: 0.1829 (0.3297) loss: 0.7822 (0.7795) time: 0.2673 data: 0.0001 max mem: 26157 Train: [79] [5100/6250] eta: 0:05:08 lr: 0.000013 grad: 0.2023 (0.3281) loss: 0.7778 (0.7795) time: 0.2668 data: 0.0002 max mem: 26157 Train: [79] [5200/6250] eta: 0:04:41 lr: 0.000013 grad: 0.1941 (0.3297) loss: 0.7794 (0.7796) time: 0.2670 data: 0.0002 max mem: 26157 Train: [79] [5300/6250] eta: 0:04:14 lr: 0.000013 grad: 0.1793 (0.3292) loss: 0.7878 (0.7796) time: 0.2660 data: 0.0001 max mem: 26157 Train: [79] [5400/6250] eta: 0:03:47 lr: 0.000013 grad: 0.2134 (0.3285) loss: 0.7826 (0.7796) time: 0.2669 data: 0.0001 max mem: 26157 Train: [79] [5500/6250] eta: 0:03:20 lr: 0.000013 grad: 0.1843 (0.3267) loss: 0.7853 (0.7797) time: 0.2674 data: 0.0001 max mem: 26157 Train: [79] [5600/6250] eta: 0:02:54 lr: 0.000013 grad: 0.1824 (0.3267) loss: 0.7838 (0.7798) time: 0.2675 data: 0.0002 max mem: 26157 Train: [79] [5700/6250] eta: 0:02:27 lr: 0.000013 grad: 0.1970 (0.3262) loss: 0.7805 (0.7798) time: 0.2680 data: 0.0002 max mem: 26157 Train: [79] [5800/6250] eta: 0:02:00 lr: 0.000013 grad: 0.1833 (0.3258) loss: 0.7784 (0.7798) time: 0.2678 data: 0.0001 max mem: 26157 Train: [79] [5900/6250] eta: 0:01:33 lr: 0.000013 grad: 0.2205 (0.3254) loss: 0.7742 (0.7797) time: 0.2667 data: 0.0001 max mem: 26157 Train: [79] [6000/6250] eta: 0:01:06 lr: 0.000013 grad: 0.2000 (0.3240) loss: 0.7759 (0.7797) time: 0.2676 data: 0.0002 max mem: 26157 Train: [79] [6100/6250] eta: 0:00:40 lr: 0.000013 grad: 0.2056 (0.3241) loss: 0.7794 (0.7797) time: 0.2663 data: 0.0001 max mem: 26157 Train: [79] [6200/6250] eta: 0:00:13 lr: 0.000013 grad: 0.1906 (0.3237) loss: 0.7747 (0.7796) time: 0.2667 data: 0.0001 max mem: 26157 Train: [79] [6249/6250] eta: 0:00:00 lr: 0.000013 grad: 0.1884 (0.3232) loss: 0.7880 (0.7796) time: 0.2671 data: 0.0001 max mem: 26157 Train: [79] Total time: 0:27:57 (0.2684 s / it) Averaged stats: lr: 0.000013 grad: 0.1884 (0.3232) loss: 0.7880 (0.7796) Eval (hcp-train-subset): [79] [ 0/62] eta: 0:04:47 loss: 0.8078 (0.8078) time: 4.6416 data: 4.5583 max mem: 26157 Eval (hcp-train-subset): [79] [61/62] eta: 0:00:00 loss: 0.7960 (0.8004) time: 0.0905 data: 0.0080 max mem: 26157 Eval (hcp-train-subset): [79] Total time: 0:00:10 (0.1722 s / it) Averaged stats (hcp-train-subset): loss: 0.7960 (0.8004) Making plots (hcp-train-subset): example=9 Eval (hcp-val): [79] [ 0/62] eta: 0:03:19 loss: 0.8186 (0.8186) time: 3.2183 data: 3.1116 max mem: 26157 Eval (hcp-val): [79] [61/62] eta: 0:00:00 loss: 0.8226 (0.8238) time: 0.0979 data: 0.0153 max mem: 26157 Eval (hcp-val): [79] Total time: 0:00:11 (0.1850 s / it) Averaged stats (hcp-val): loss: 0.8226 (0.8238) Making plots (hcp-val): example=37 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [80] [ 0/6250] eta: 6:43:18 lr: 0.000013 grad: 0.1634 (0.1634) loss: 0.8307 (0.8307) time: 3.8717 data: 3.5267 max mem: 26157 Train: [80] [ 100/6250] eta: 0:32:03 lr: 0.000013 grad: 0.2215 (0.4047) loss: 0.7985 (0.8101) time: 0.2680 data: 0.0001 max mem: 26157 Train: [80] [ 200/6250] eta: 0:29:16 lr: 0.000013 grad: 0.2431 (0.3340) loss: 0.7884 (0.8015) time: 0.2677 data: 0.0001 max mem: 26157 Train: [80] [ 300/6250] eta: 0:28:00 lr: 0.000013 grad: 0.2161 (0.3169) loss: 0.7918 (0.7975) time: 0.2666 data: 0.0002 max mem: 26157 Train: [80] [ 400/6250] eta: 0:27:10 lr: 0.000013 grad: 0.1776 (0.3194) loss: 0.7961 (0.7961) time: 0.2675 data: 0.0001 max mem: 26157 Train: [80] [ 500/6250] eta: 0:26:29 lr: 0.000013 grad: 0.2100 (0.3055) loss: 0.7800 (0.7950) time: 0.2674 data: 0.0001 max mem: 26157 Train: [80] [ 600/6250] eta: 0:25:53 lr: 0.000013 grad: 0.2024 (0.2982) loss: 0.7842 (0.7940) time: 0.2662 data: 0.0001 max mem: 26157 Train: [80] [ 700/6250] eta: 0:25:19 lr: 0.000013 grad: 0.2118 (0.3085) loss: 0.7859 (0.7932) time: 0.2665 data: 0.0001 max mem: 26157 Train: [80] [ 800/6250] eta: 0:24:47 lr: 0.000013 grad: 0.1895 (0.3037) loss: 0.7827 (0.7921) time: 0.2671 data: 0.0001 max mem: 26157 Train: [80] [ 900/6250] eta: 0:24:16 lr: 0.000013 grad: 0.2037 (0.3210) loss: 0.7748 (0.7911) time: 0.2670 data: 0.0001 max mem: 26157 Train: [80] [1000/6250] eta: 0:23:46 lr: 0.000013 grad: 0.1974 (0.3176) loss: 0.7823 (0.7903) time: 0.2668 data: 0.0001 max mem: 26157 Train: [80] [1100/6250] eta: 0:23:17 lr: 0.000013 grad: 0.2046 (0.3165) loss: 0.7720 (0.7894) time: 0.2663 data: 0.0002 max mem: 26157 Train: [80] [1200/6250] eta: 0:22:48 lr: 0.000013 grad: 0.2016 (0.3177) loss: 0.7754 (0.7887) time: 0.2675 data: 0.0001 max mem: 26157 Train: [80] [1300/6250] eta: 0:22:19 lr: 0.000013 grad: 0.1922 (0.3161) loss: 0.7823 (0.7880) time: 0.2666 data: 0.0001 max mem: 26157 Train: [80] [1400/6250] eta: 0:21:51 lr: 0.000013 grad: 0.1878 (0.3181) loss: 0.7839 (0.7876) time: 0.2671 data: 0.0002 max mem: 26157 Train: [80] [1500/6250] eta: 0:21:23 lr: 0.000013 grad: 0.2053 (0.3158) loss: 0.7757 (0.7872) time: 0.2664 data: 0.0001 max mem: 26157 Train: [80] [1600/6250] eta: 0:20:55 lr: 0.000013 grad: 0.1964 (0.3112) loss: 0.7769 (0.7867) time: 0.2666 data: 0.0001 max mem: 26157 Train: [80] [1700/6250] eta: 0:20:27 lr: 0.000013 grad: 0.2191 (0.3125) loss: 0.7753 (0.7863) time: 0.2664 data: 0.0001 max mem: 26157 Train: [80] [1800/6250] eta: 0:19:59 lr: 0.000013 grad: 0.1895 (0.3121) loss: 0.7768 (0.7859) time: 0.2668 data: 0.0001 max mem: 26157 Train: [80] [1900/6250] eta: 0:19:32 lr: 0.000013 grad: 0.1990 (0.3116) loss: 0.7726 (0.7854) time: 0.2675 data: 0.0002 max mem: 26157 Train: [80] [2000/6250] eta: 0:19:04 lr: 0.000013 grad: 0.1867 (0.3083) loss: 0.7785 (0.7850) time: 0.2674 data: 0.0001 max mem: 26157 Train: [80] [2100/6250] eta: 0:18:37 lr: 0.000013 grad: 0.1878 (0.3068) loss: 0.7683 (0.7846) time: 0.2672 data: 0.0002 max mem: 26157 Train: [80] [2200/6250] eta: 0:18:09 lr: 0.000013 grad: 0.1962 (0.3089) loss: 0.7855 (0.7844) time: 0.2669 data: 0.0001 max mem: 26157 Train: [80] [2300/6250] eta: 0:17:42 lr: 0.000013 grad: 0.1815 (0.3099) loss: 0.7804 (0.7841) time: 0.2670 data: 0.0001 max mem: 26157 Train: [80] [2400/6250] eta: 0:17:15 lr: 0.000013 grad: 0.1792 (0.3081) loss: 0.7835 (0.7839) time: 0.2658 data: 0.0001 max mem: 26157 Train: [80] [2500/6250] eta: 0:16:48 lr: 0.000013 grad: 0.1905 (0.3066) loss: 0.7850 (0.7838) time: 0.2671 data: 0.0001 max mem: 26157 Train: [80] [2600/6250] eta: 0:16:21 lr: 0.000013 grad: 0.1888 (0.3074) loss: 0.7795 (0.7836) time: 0.2680 data: 0.0001 max mem: 26157 Train: [80] [2700/6250] eta: 0:15:54 lr: 0.000013 grad: 0.2156 (0.3128) loss: 0.7831 (0.7834) time: 0.2679 data: 0.0001 max mem: 26157 Train: [80] [2800/6250] eta: 0:15:27 lr: 0.000013 grad: 0.1958 (0.3102) loss: 0.7824 (0.7833) time: 0.2679 data: 0.0001 max mem: 26157 Train: [80] [2900/6250] eta: 0:15:00 lr: 0.000013 grad: 0.2138 (0.3094) loss: 0.7766 (0.7831) time: 0.2685 data: 0.0001 max mem: 26157 Train: [80] [3000/6250] eta: 0:14:33 lr: 0.000013 grad: 0.1985 (0.3127) loss: 0.7767 (0.7829) time: 0.2680 data: 0.0001 max mem: 26157 Train: [80] [3100/6250] eta: 0:14:06 lr: 0.000013 grad: 0.1939 (0.3115) loss: 0.7790 (0.7827) time: 0.2684 data: 0.0001 max mem: 26157 Train: [80] [3200/6250] eta: 0:13:39 lr: 0.000013 grad: 0.1984 (0.3141) loss: 0.7798 (0.7825) time: 0.2678 data: 0.0002 max mem: 26157 Train: [80] [3300/6250] eta: 0:13:12 lr: 0.000013 grad: 0.1888 (0.3180) loss: 0.7757 (0.7823) time: 0.2672 data: 0.0001 max mem: 26157 Train: [80] [3400/6250] eta: 0:12:45 lr: 0.000012 grad: 0.1963 (0.3177) loss: 0.7824 (0.7822) time: 0.2693 data: 0.0002 max mem: 26157 Train: [80] [3500/6250] eta: 0:12:18 lr: 0.000012 grad: 0.1890 (0.3185) loss: 0.7698 (0.7820) time: 0.2690 data: 0.0002 max mem: 26157 Train: [80] [3600/6250] eta: 0:11:51 lr: 0.000012 grad: 0.1956 (0.3179) loss: 0.7796 (0.7819) time: 0.2668 data: 0.0001 max mem: 26157 Train: [80] [3700/6250] eta: 0:11:24 lr: 0.000012 grad: 0.1938 (0.3186) loss: 0.7762 (0.7819) time: 0.2661 data: 0.0001 max mem: 26157 Train: [80] [3800/6250] eta: 0:10:57 lr: 0.000012 grad: 0.2032 (0.3254) loss: 0.7804 (0.7818) time: 0.2672 data: 0.0002 max mem: 26157 Train: [80] [3900/6250] eta: 0:10:30 lr: 0.000012 grad: 0.1896 (0.3242) loss: 0.7746 (0.7818) time: 0.2666 data: 0.0001 max mem: 26157 Train: [80] [4000/6250] eta: 0:10:03 lr: 0.000012 grad: 0.2124 (0.3257) loss: 0.7850 (0.7818) time: 0.2674 data: 0.0001 max mem: 26157 Train: [80] [4100/6250] eta: 0:09:37 lr: 0.000012 grad: 0.2129 (0.3265) loss: 0.7789 (0.7818) time: 0.2664 data: 0.0001 max mem: 26157 Train: [80] [4200/6250] eta: 0:09:10 lr: 0.000012 grad: 0.2143 (0.3294) loss: 0.7840 (0.7818) time: 0.2668 data: 0.0001 max mem: 26157 Train: [80] [4300/6250] eta: 0:08:43 lr: 0.000012 grad: 0.1969 (0.3315) loss: 0.7841 (0.7818) time: 0.2677 data: 0.0001 max mem: 26157 Train: [80] [4400/6250] eta: 0:08:16 lr: 0.000012 grad: 0.1961 (0.3315) loss: 0.7933 (0.7819) time: 0.2666 data: 0.0001 max mem: 26157 Train: [80] [4500/6250] eta: 0:07:49 lr: 0.000012 grad: 0.1981 (0.3344) loss: 0.7868 (0.7819) time: 0.2665 data: 0.0001 max mem: 26157 Train: [80] [4600/6250] eta: 0:07:22 lr: 0.000012 grad: 0.1980 (0.3334) loss: 0.7776 (0.7820) time: 0.2661 data: 0.0001 max mem: 26157 Train: [80] [4700/6250] eta: 0:06:55 lr: 0.000012 grad: 0.2058 (0.3319) loss: 0.7843 (0.7819) time: 0.2667 data: 0.0001 max mem: 26157 Train: [80] [4800/6250] eta: 0:06:28 lr: 0.000012 grad: 0.1989 (0.3312) loss: 0.7864 (0.7819) time: 0.2671 data: 0.0001 max mem: 26157 Train: [80] [4900/6250] eta: 0:06:02 lr: 0.000012 grad: 0.1879 (0.3306) loss: 0.7858 (0.7820) time: 0.2663 data: 0.0002 max mem: 26157 Train: [80] [5000/6250] eta: 0:05:35 lr: 0.000012 grad: 0.1999 (0.3290) loss: 0.7782 (0.7820) time: 0.2665 data: 0.0001 max mem: 26157 Train: [80] [5100/6250] eta: 0:05:08 lr: 0.000012 grad: 0.1935 (0.3274) loss: 0.7793 (0.7819) time: 0.2660 data: 0.0001 max mem: 26157 Train: [80] [5200/6250] eta: 0:04:41 lr: 0.000012 grad: 0.1954 (0.3263) loss: 0.7865 (0.7819) time: 0.2668 data: 0.0001 max mem: 26157 Train: [80] [5300/6250] eta: 0:04:14 lr: 0.000012 grad: 0.1935 (0.3277) loss: 0.7845 (0.7820) time: 0.2667 data: 0.0001 max mem: 26157 Train: [80] [5400/6250] eta: 0:03:47 lr: 0.000012 grad: 0.1907 (0.3269) loss: 0.7799 (0.7819) time: 0.2678 data: 0.0002 max mem: 26157 Train: [80] [5500/6250] eta: 0:03:21 lr: 0.000012 grad: 0.2025 (0.3264) loss: 0.7897 (0.7819) time: 0.2673 data: 0.0002 max mem: 26157 Train: [80] [5600/6250] eta: 0:02:54 lr: 0.000012 grad: 0.1907 (0.3264) loss: 0.7791 (0.7820) time: 0.2680 data: 0.0002 max mem: 26157 Train: [80] [5700/6250] eta: 0:02:27 lr: 0.000012 grad: 0.1935 (0.3258) loss: 0.7827 (0.7819) time: 0.2675 data: 0.0001 max mem: 26157 Train: [80] [5800/6250] eta: 0:02:00 lr: 0.000012 grad: 0.1963 (0.3258) loss: 0.7862 (0.7820) time: 0.2668 data: 0.0001 max mem: 26157 Train: [80] [5900/6250] eta: 0:01:33 lr: 0.000012 grad: 0.1946 (0.3247) loss: 0.7767 (0.7819) time: 0.2671 data: 0.0001 max mem: 26157 Train: [80] [6000/6250] eta: 0:01:07 lr: 0.000012 grad: 0.2042 (0.3267) loss: 0.7814 (0.7819) time: 0.2661 data: 0.0002 max mem: 26157 Train: [80] [6100/6250] eta: 0:00:40 lr: 0.000012 grad: 0.2001 (0.3261) loss: 0.7769 (0.7819) time: 0.2676 data: 0.0001 max mem: 26157 Train: [80] [6200/6250] eta: 0:00:13 lr: 0.000012 grad: 0.2021 (0.3258) loss: 0.7824 (0.7818) time: 0.2679 data: 0.0001 max mem: 26157 Train: [80] [6249/6250] eta: 0:00:00 lr: 0.000012 grad: 0.2020 (0.3259) loss: 0.7747 (0.7818) time: 0.2668 data: 0.0001 max mem: 26157 Train: [80] Total time: 0:27:58 (0.2685 s / it) Averaged stats: lr: 0.000012 grad: 0.2020 (0.3259) loss: 0.7747 (0.7818) Eval (hcp-train-subset): [80] [ 0/62] eta: 0:03:25 loss: 0.8126 (0.8126) time: 3.3075 data: 3.2103 max mem: 26157 Eval (hcp-train-subset): [80] [61/62] eta: 0:00:00 loss: 0.8004 (0.8009) time: 0.0824 data: 0.0001 max mem: 26157 Eval (hcp-train-subset): [80] Total time: 0:00:10 (0.1691 s / it) Averaged stats (hcp-train-subset): loss: 0.8004 (0.8009) Making plots (hcp-train-subset): example=5 Eval (hcp-val): [80] [ 0/62] eta: 0:02:34 loss: 0.8205 (0.8205) time: 2.4885 data: 2.3935 max mem: 26157 Eval (hcp-val): [80] [61/62] eta: 0:00:00 loss: 0.8231 (0.8240) time: 0.0838 data: 0.0001 max mem: 26157 Eval (hcp-val): [80] Total time: 0:00:10 (0.1660 s / it) Averaged stats (hcp-val): loss: 0.8231 (0.8240) Making plots (hcp-val): example=58 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [81] [ 0/6250] eta: 6:38:27 lr: 0.000012 grad: 0.5728 (0.5728) loss: 0.8052 (0.8052) time: 3.8252 data: 3.5196 max mem: 26157 Train: [81] [ 100/6250] eta: 0:32:15 lr: 0.000012 grad: 0.2515 (0.3394) loss: 0.7844 (0.7893) time: 0.2663 data: 0.0001 max mem: 26157 Train: [81] [ 200/6250] eta: 0:29:19 lr: 0.000012 grad: 0.2061 (0.3344) loss: 0.7958 (0.7902) time: 0.2669 data: 0.0001 max mem: 26157 Train: [81] [ 300/6250] eta: 0:28:04 lr: 0.000012 grad: 0.1983 (0.2991) loss: 0.7889 (0.7895) time: 0.2679 data: 0.0002 max mem: 26157 Train: [81] [ 400/6250] eta: 0:27:12 lr: 0.000012 grad: 0.1956 (0.2919) loss: 0.7868 (0.7886) time: 0.2674 data: 0.0001 max mem: 26157 Train: [81] [ 500/6250] eta: 0:26:31 lr: 0.000012 grad: 0.1852 (0.2980) loss: 0.7875 (0.7881) time: 0.2677 data: 0.0001 max mem: 26157 Train: [81] [ 600/6250] eta: 0:25:55 lr: 0.000012 grad: 0.2011 (0.3080) loss: 0.7829 (0.7877) time: 0.2664 data: 0.0001 max mem: 26157 Train: [81] [ 700/6250] eta: 0:25:22 lr: 0.000012 grad: 0.1967 (0.3021) loss: 0.7915 (0.7876) time: 0.2667 data: 0.0001 max mem: 26157 Train: [81] [ 800/6250] eta: 0:24:50 lr: 0.000012 grad: 0.1906 (0.3058) loss: 0.7794 (0.7871) time: 0.2666 data: 0.0001 max mem: 26157 Train: [81] [ 900/6250] eta: 0:24:19 lr: 0.000012 grad: 0.1923 (0.3063) loss: 0.7855 (0.7866) time: 0.2686 data: 0.0002 max mem: 26157 Train: [81] [1000/6250] eta: 0:23:49 lr: 0.000012 grad: 0.1971 (0.3023) loss: 0.7802 (0.7858) time: 0.2667 data: 0.0001 max mem: 26157 Train: [81] [1100/6250] eta: 0:23:20 lr: 0.000012 grad: 0.2008 (0.3093) loss: 0.7718 (0.7851) time: 0.2677 data: 0.0002 max mem: 26157 Train: [81] [1200/6250] eta: 0:22:50 lr: 0.000012 grad: 0.1908 (0.3112) loss: 0.7776 (0.7842) time: 0.2669 data: 0.0001 max mem: 26157 Train: [81] [1300/6250] eta: 0:22:22 lr: 0.000012 grad: 0.1913 (0.3092) loss: 0.7815 (0.7838) time: 0.2674 data: 0.0001 max mem: 26157 Train: [81] [1400/6250] eta: 0:21:53 lr: 0.000012 grad: 0.1995 (0.3144) loss: 0.7769 (0.7832) time: 0.2666 data: 0.0001 max mem: 26157 Train: [81] [1500/6250] eta: 0:21:25 lr: 0.000012 grad: 0.1867 (0.3133) loss: 0.7799 (0.7828) time: 0.2676 data: 0.0001 max mem: 26157 Train: [81] [1600/6250] eta: 0:20:57 lr: 0.000012 grad: 0.1940 (0.3118) loss: 0.7765 (0.7825) time: 0.2673 data: 0.0001 max mem: 26157 Train: [81] [1700/6250] eta: 0:20:29 lr: 0.000012 grad: 0.2057 (0.3151) loss: 0.7846 (0.7821) time: 0.2670 data: 0.0001 max mem: 26157 Train: [81] [1800/6250] eta: 0:20:01 lr: 0.000012 grad: 0.1871 (0.3234) loss: 0.7696 (0.7819) time: 0.2666 data: 0.0001 max mem: 26157 Train: [81] [1900/6250] eta: 0:19:33 lr: 0.000012 grad: 0.2024 (0.3250) loss: 0.7754 (0.7816) time: 0.2670 data: 0.0001 max mem: 26157 Train: [81] [2000/6250] eta: 0:19:06 lr: 0.000012 grad: 0.2150 (0.3232) loss: 0.7735 (0.7812) time: 0.2660 data: 0.0001 max mem: 26157 Train: [81] [2100/6250] eta: 0:18:38 lr: 0.000012 grad: 0.2056 (0.3209) loss: 0.7796 (0.7810) time: 0.2666 data: 0.0001 max mem: 26157 Train: [81] [2200/6250] eta: 0:18:11 lr: 0.000012 grad: 0.1969 (0.3192) loss: 0.7714 (0.7808) time: 0.2668 data: 0.0001 max mem: 26157 Train: [81] [2300/6250] eta: 0:17:43 lr: 0.000011 grad: 0.1975 (0.3230) loss: 0.7814 (0.7807) time: 0.2665 data: 0.0001 max mem: 26157 Train: [81] [2400/6250] eta: 0:17:16 lr: 0.000011 grad: 0.2124 (0.3276) loss: 0.7780 (0.7806) time: 0.2668 data: 0.0002 max mem: 26157 Train: [81] [2500/6250] eta: 0:16:49 lr: 0.000011 grad: 0.1971 (0.3266) loss: 0.7777 (0.7806) time: 0.2663 data: 0.0002 max mem: 26157 Train: [81] [2600/6250] eta: 0:16:22 lr: 0.000011 grad: 0.1890 (0.3269) loss: 0.7732 (0.7805) time: 0.2677 data: 0.0001 max mem: 26157 Train: [81] [2700/6250] eta: 0:15:54 lr: 0.000011 grad: 0.1852 (0.3273) loss: 0.7828 (0.7805) time: 0.2663 data: 0.0001 max mem: 26157 Train: [81] [2800/6250] eta: 0:15:27 lr: 0.000011 grad: 0.2031 (0.3265) loss: 0.7812 (0.7805) time: 0.2665 data: 0.0001 max mem: 26157 Train: [81] [2900/6250] eta: 0:15:00 lr: 0.000011 grad: 0.2087 (0.3316) loss: 0.7778 (0.7804) time: 0.2671 data: 0.0001 max mem: 26157 Train: [81] [3000/6250] eta: 0:14:33 lr: 0.000011 grad: 0.1906 (0.3361) loss: 0.7892 (0.7805) time: 0.2661 data: 0.0001 max mem: 26157 Train: [81] [3100/6250] eta: 0:14:06 lr: 0.000011 grad: 0.1936 (0.3337) loss: 0.7805 (0.7806) time: 0.2671 data: 0.0002 max mem: 26157 Train: [81] [3200/6250] eta: 0:13:39 lr: 0.000011 grad: 0.1840 (0.3323) loss: 0.7878 (0.7807) time: 0.2667 data: 0.0001 max mem: 26157 Train: [81] [3300/6250] eta: 0:13:12 lr: 0.000011 grad: 0.2005 (0.3299) loss: 0.7847 (0.7808) time: 0.2673 data: 0.0002 max mem: 26157 Train: [81] [3400/6250] eta: 0:12:45 lr: 0.000011 grad: 0.1959 (0.3291) loss: 0.7847 (0.7810) time: 0.2671 data: 0.0001 max mem: 26157 Train: [81] [3500/6250] eta: 0:12:18 lr: 0.000011 grad: 0.1923 (0.3361) loss: 0.7912 (0.7811) time: 0.2671 data: 0.0002 max mem: 26157 Train: [81] [3600/6250] eta: 0:11:51 lr: 0.000011 grad: 0.1881 (0.3333) loss: 0.7845 (0.7812) time: 0.2668 data: 0.0001 max mem: 26157 Train: [81] [3700/6250] eta: 0:11:24 lr: 0.000011 grad: 0.1816 (0.3324) loss: 0.7815 (0.7813) time: 0.2666 data: 0.0001 max mem: 26157 Train: [81] [3800/6250] eta: 0:10:57 lr: 0.000011 grad: 0.1893 (0.3302) loss: 0.7854 (0.7813) time: 0.2673 data: 0.0002 max mem: 26157 Train: [81] [3900/6250] eta: 0:10:30 lr: 0.000011 grad: 0.1936 (0.3354) loss: 0.7848 (0.7814) time: 0.2662 data: 0.0001 max mem: 26157 Train: [81] [4000/6250] eta: 0:10:03 lr: 0.000011 grad: 0.1986 (0.3347) loss: 0.7824 (0.7814) time: 0.2671 data: 0.0001 max mem: 26157 Train: [81] [4100/6250] eta: 0:09:36 lr: 0.000011 grad: 0.1897 (0.3361) loss: 0.7822 (0.7815) time: 0.2665 data: 0.0001 max mem: 26157 Train: [81] [4200/6250] eta: 0:09:10 lr: 0.000011 grad: 0.1947 (0.3368) loss: 0.7852 (0.7816) time: 0.2668 data: 0.0001 max mem: 26157 Train: [81] [4300/6250] eta: 0:08:43 lr: 0.000011 grad: 0.2054 (0.3382) loss: 0.7809 (0.7817) time: 0.2664 data: 0.0001 max mem: 26157 Train: [81] [4400/6250] eta: 0:08:16 lr: 0.000011 grad: 0.2011 (0.3368) loss: 0.7824 (0.7818) time: 0.2677 data: 0.0001 max mem: 26157 Train: [81] [4500/6250] eta: 0:07:49 lr: 0.000011 grad: 0.1886 (0.3379) loss: 0.7833 (0.7818) time: 0.2669 data: 0.0001 max mem: 26157 Train: [81] [4600/6250] eta: 0:07:22 lr: 0.000011 grad: 0.1966 (0.3374) loss: 0.7803 (0.7819) time: 0.2667 data: 0.0001 max mem: 26157 Train: [81] [4700/6250] eta: 0:06:55 lr: 0.000011 grad: 0.2086 (0.3403) loss: 0.7803 (0.7818) time: 0.2666 data: 0.0001 max mem: 26157 Train: [81] [4800/6250] eta: 0:06:28 lr: 0.000011 grad: 0.1897 (0.3389) loss: 0.7770 (0.7818) time: 0.2664 data: 0.0001 max mem: 26157 Train: [81] [4900/6250] eta: 0:06:01 lr: 0.000011 grad: 0.2030 (0.3405) loss: 0.7802 (0.7817) time: 0.2664 data: 0.0002 max mem: 26157 Train: [81] [5000/6250] eta: 0:05:35 lr: 0.000011 grad: 0.1912 (0.3402) loss: 0.7777 (0.7816) time: 0.2683 data: 0.0001 max mem: 26157 Train: [81] [5100/6250] eta: 0:05:08 lr: 0.000011 grad: 0.1957 (0.3423) loss: 0.7758 (0.7816) time: 0.2668 data: 0.0001 max mem: 26157 Train: [81] [5200/6250] eta: 0:04:41 lr: 0.000011 grad: 0.2047 (0.3438) loss: 0.7680 (0.7815) time: 0.2667 data: 0.0001 max mem: 26157 Train: [81] [5300/6250] eta: 0:04:14 lr: 0.000011 grad: 0.2037 (0.3473) loss: 0.7681 (0.7814) time: 0.2668 data: 0.0001 max mem: 26157 Train: [81] [5400/6250] eta: 0:03:47 lr: 0.000011 grad: 0.1897 (0.3500) loss: 0.7806 (0.7812) time: 0.2670 data: 0.0001 max mem: 26157 Train: [81] [5500/6250] eta: 0:03:20 lr: 0.000011 grad: 0.2099 (0.3491) loss: 0.7734 (0.7811) time: 0.2667 data: 0.0001 max mem: 26157 Train: [81] [5600/6250] eta: 0:02:54 lr: 0.000011 grad: 0.1897 (0.3488) loss: 0.7743 (0.7811) time: 0.2668 data: 0.0001 max mem: 26157 Train: [81] [5700/6250] eta: 0:02:27 lr: 0.000011 grad: 0.1978 (0.3480) loss: 0.7715 (0.7810) time: 0.2700 data: 0.0002 max mem: 26157 Train: [81] [5800/6250] eta: 0:02:00 lr: 0.000011 grad: 0.1916 (0.3502) loss: 0.7761 (0.7810) time: 0.2703 data: 0.0002 max mem: 26157 Train: [81] [5900/6250] eta: 0:01:33 lr: 0.000011 grad: 0.2027 (0.3494) loss: 0.7754 (0.7809) time: 0.2675 data: 0.0001 max mem: 26157 Train: [81] [6000/6250] eta: 0:01:06 lr: 0.000011 grad: 0.2094 (0.3493) loss: 0.7729 (0.7807) time: 0.2674 data: 0.0001 max mem: 26157 Train: [81] [6100/6250] eta: 0:00:40 lr: 0.000011 grad: 0.1981 (0.3508) loss: 0.7746 (0.7806) time: 0.2670 data: 0.0001 max mem: 26157 Train: [81] [6200/6250] eta: 0:00:13 lr: 0.000011 grad: 0.2201 (0.3527) loss: 0.7661 (0.7805) time: 0.2724 data: 0.0002 max mem: 26157 Train: [81] [6249/6250] eta: 0:00:00 lr: 0.000011 grad: 0.1923 (0.3523) loss: 0.7753 (0.7804) time: 0.2666 data: 0.0001 max mem: 26157 Train: [81] Total time: 0:27:57 (0.2684 s / it) Averaged stats: lr: 0.000011 grad: 0.1923 (0.3523) loss: 0.7753 (0.7804) Eval (hcp-train-subset): [81] [ 0/62] eta: 0:04:41 loss: 0.8159 (0.8159) time: 4.5417 data: 4.4584 max mem: 26157 Eval (hcp-train-subset): [81] [61/62] eta: 0:00:00 loss: 0.7977 (0.7979) time: 0.0890 data: 0.0066 max mem: 26157 Eval (hcp-train-subset): [81] Total time: 0:00:10 (0.1705 s / it) Averaged stats (hcp-train-subset): loss: 0.7977 (0.7979) Making plots (hcp-train-subset): example=38 Eval (hcp-val): [81] [ 0/62] eta: 0:03:33 loss: 0.8216 (0.8216) time: 3.4404 data: 3.3241 max mem: 26157 Eval (hcp-val): [81] [61/62] eta: 0:00:00 loss: 0.8232 (0.8240) time: 0.1039 data: 0.0215 max mem: 26157 Eval (hcp-val): [81] Total time: 0:00:11 (0.1887 s / it) Averaged stats (hcp-val): loss: 0.8232 (0.8240) Making plots (hcp-val): example=7 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [82] [ 0/6250] eta: 6:16:27 lr: 0.000011 grad: 0.2058 (0.2058) loss: 0.8298 (0.8298) time: 3.6139 data: 3.2820 max mem: 26157 Train: [82] [ 100/6250] eta: 0:32:04 lr: 0.000011 grad: 0.2421 (0.4366) loss: 0.7964 (0.7944) time: 0.2674 data: 0.0001 max mem: 26157 Train: [82] [ 200/6250] eta: 0:29:16 lr: 0.000011 grad: 0.2268 (0.3776) loss: 0.7837 (0.7895) time: 0.2682 data: 0.0002 max mem: 26157 Train: [82] [ 300/6250] eta: 0:28:02 lr: 0.000011 grad: 0.2168 (0.3457) loss: 0.7763 (0.7885) time: 0.2678 data: 0.0002 max mem: 26157 Train: [82] [ 400/6250] eta: 0:27:11 lr: 0.000011 grad: 0.2127 (0.3229) loss: 0.7890 (0.7878) time: 0.2671 data: 0.0002 max mem: 26157 Train: [82] [ 500/6250] eta: 0:26:30 lr: 0.000011 grad: 0.2063 (0.3157) loss: 0.7783 (0.7878) time: 0.2676 data: 0.0002 max mem: 26157 Train: [82] [ 600/6250] eta: 0:25:54 lr: 0.000011 grad: 0.1894 (0.3144) loss: 0.7966 (0.7877) time: 0.2667 data: 0.0001 max mem: 26157 Train: [82] [ 700/6250] eta: 0:25:20 lr: 0.000011 grad: 0.1922 (0.3148) loss: 0.7803 (0.7874) time: 0.2667 data: 0.0001 max mem: 26157 Train: [82] [ 800/6250] eta: 0:24:48 lr: 0.000011 grad: 0.2166 (0.3212) loss: 0.7803 (0.7865) time: 0.2668 data: 0.0002 max mem: 26157 Train: [82] [ 900/6250] eta: 0:24:17 lr: 0.000011 grad: 0.1925 (0.3197) loss: 0.7816 (0.7860) time: 0.2677 data: 0.0002 max mem: 26157 Train: [82] [1000/6250] eta: 0:23:47 lr: 0.000011 grad: 0.2051 (0.3371) loss: 0.7749 (0.7857) time: 0.2670 data: 0.0002 max mem: 26157 Train: [82] [1100/6250] eta: 0:23:17 lr: 0.000011 grad: 0.1974 (0.3426) loss: 0.7810 (0.7854) time: 0.2664 data: 0.0002 max mem: 26157 Train: [82] [1200/6250] eta: 0:22:48 lr: 0.000011 grad: 0.3991 (0.3687) loss: 0.7874 (0.7852) time: 0.2669 data: 0.0002 max mem: 26157 Train: [82] [1300/6250] eta: 0:22:22 lr: 0.000011 grad: 0.1972 (0.3685) loss: 0.7832 (0.7851) time: 0.2658 data: 0.0001 max mem: 26157 Train: [82] [1400/6250] eta: 0:21:53 lr: 0.000010 grad: 0.1991 (0.3677) loss: 0.7800 (0.7848) time: 0.2673 data: 0.0001 max mem: 26157 Train: [82] [1500/6250] eta: 0:21:25 lr: 0.000010 grad: 0.2075 (0.3614) loss: 0.7771 (0.7844) time: 0.2672 data: 0.0001 max mem: 26157 Train: [82] [1600/6250] eta: 0:20:57 lr: 0.000010 grad: 0.1945 (0.3544) loss: 0.7811 (0.7842) time: 0.2664 data: 0.0002 max mem: 26157 Train: [82] [1700/6250] eta: 0:20:29 lr: 0.000010 grad: 0.2113 (0.3543) loss: 0.7809 (0.7839) time: 0.2670 data: 0.0001 max mem: 26157 Train: [82] [1800/6250] eta: 0:20:01 lr: 0.000010 grad: 0.1977 (0.3558) loss: 0.7785 (0.7837) time: 0.2672 data: 0.0001 max mem: 26157 Train: [82] [1900/6250] eta: 0:19:33 lr: 0.000010 grad: 0.2064 (0.3638) loss: 0.7811 (0.7835) time: 0.2671 data: 0.0001 max mem: 26157 Train: [82] [2000/6250] eta: 0:19:06 lr: 0.000010 grad: 0.2146 (0.3616) loss: 0.7775 (0.7832) time: 0.2673 data: 0.0001 max mem: 26157 Train: [82] [2100/6250] eta: 0:18:38 lr: 0.000010 grad: 0.2131 (0.3600) loss: 0.7777 (0.7830) time: 0.2670 data: 0.0001 max mem: 26157 Train: [82] [2200/6250] eta: 0:18:11 lr: 0.000010 grad: 0.2021 (0.3585) loss: 0.7759 (0.7827) time: 0.2676 data: 0.0001 max mem: 26157 Train: [82] [2300/6250] eta: 0:17:44 lr: 0.000010 grad: 0.1938 (0.3547) loss: 0.7733 (0.7824) time: 0.2671 data: 0.0001 max mem: 26157 Train: [82] [2400/6250] eta: 0:17:16 lr: 0.000010 grad: 0.1964 (0.3581) loss: 0.7741 (0.7823) time: 0.2667 data: 0.0001 max mem: 26157 Train: [82] [2500/6250] eta: 0:16:49 lr: 0.000010 grad: 0.1915 (0.3591) loss: 0.7819 (0.7822) time: 0.2670 data: 0.0001 max mem: 26157 Train: [82] [2600/6250] eta: 0:16:22 lr: 0.000010 grad: 0.2027 (0.3605) loss: 0.7778 (0.7821) time: 0.2671 data: 0.0001 max mem: 26157 Train: [82] [2700/6250] eta: 0:15:55 lr: 0.000010 grad: 0.2136 (0.3613) loss: 0.7707 (0.7820) time: 0.2666 data: 0.0001 max mem: 26157 Train: [82] [2800/6250] eta: 0:15:27 lr: 0.000010 grad: 0.1938 (0.3590) loss: 0.7848 (0.7819) time: 0.2672 data: 0.0001 max mem: 26157 Train: [82] [2900/6250] eta: 0:15:00 lr: 0.000010 grad: 0.2039 (0.3590) loss: 0.7748 (0.7818) time: 0.2677 data: 0.0001 max mem: 26157 Train: [82] [3000/6250] eta: 0:14:33 lr: 0.000010 grad: 0.2103 (0.3579) loss: 0.7746 (0.7817) time: 0.2668 data: 0.0001 max mem: 26157 Train: [82] [3100/6250] eta: 0:14:06 lr: 0.000010 grad: 0.1926 (0.3564) loss: 0.7796 (0.7815) time: 0.2671 data: 0.0001 max mem: 26157 Train: [82] [3200/6250] eta: 0:13:39 lr: 0.000010 grad: 0.1981 (0.3555) loss: 0.7646 (0.7812) time: 0.2662 data: 0.0001 max mem: 26157 Train: [82] [3300/6250] eta: 0:13:12 lr: 0.000010 grad: 0.1926 (0.3574) loss: 0.7764 (0.7810) time: 0.2669 data: 0.0001 max mem: 26157 Train: [82] [3400/6250] eta: 0:12:45 lr: 0.000010 grad: 0.1883 (0.3570) loss: 0.7838 (0.7809) time: 0.2669 data: 0.0001 max mem: 26157 Train: [82] [3500/6250] eta: 0:12:18 lr: 0.000010 grad: 0.2296 (0.3580) loss: 0.7722 (0.7807) time: 0.2659 data: 0.0001 max mem: 26157 Train: [82] [3600/6250] eta: 0:11:51 lr: 0.000010 grad: 0.1927 (0.3585) loss: 0.7770 (0.7806) time: 0.2679 data: 0.0001 max mem: 26157 Train: [82] [3700/6250] eta: 0:11:24 lr: 0.000010 grad: 0.1986 (0.3583) loss: 0.7801 (0.7806) time: 0.2663 data: 0.0001 max mem: 26157 Train: [82] [3800/6250] eta: 0:10:57 lr: 0.000010 grad: 0.1908 (0.3568) loss: 0.7810 (0.7806) time: 0.2666 data: 0.0001 max mem: 26157 Train: [82] [3900/6250] eta: 0:10:30 lr: 0.000010 grad: 0.1931 (0.3556) loss: 0.7844 (0.7805) time: 0.2684 data: 0.0002 max mem: 26157 Train: [82] [4000/6250] eta: 0:10:03 lr: 0.000010 grad: 0.2066 (0.3556) loss: 0.7732 (0.7804) time: 0.2662 data: 0.0001 max mem: 26157 Train: [82] [4100/6250] eta: 0:09:36 lr: 0.000010 grad: 0.2049 (0.3583) loss: 0.7806 (0.7803) time: 0.2665 data: 0.0001 max mem: 26157 Train: [82] [4200/6250] eta: 0:09:10 lr: 0.000010 grad: 0.2117 (0.3599) loss: 0.7771 (0.7803) time: 0.2665 data: 0.0001 max mem: 26157 Train: [82] [4300/6250] eta: 0:08:43 lr: 0.000010 grad: 0.2161 (0.3572) loss: 0.7719 (0.7802) time: 0.2667 data: 0.0001 max mem: 26157 Train: [82] [4400/6250] eta: 0:08:16 lr: 0.000010 grad: 0.2136 (0.3589) loss: 0.7713 (0.7801) time: 0.2664 data: 0.0001 max mem: 26157 Train: [82] [4500/6250] eta: 0:07:49 lr: 0.000010 grad: 0.1970 (0.3595) loss: 0.7783 (0.7800) time: 0.2670 data: 0.0001 max mem: 26157 Train: [82] [4600/6250] eta: 0:07:22 lr: 0.000010 grad: 0.1979 (0.3588) loss: 0.7721 (0.7800) time: 0.2688 data: 0.0002 max mem: 26157 Train: [82] [4700/6250] eta: 0:06:55 lr: 0.000010 grad: 0.2061 (0.3586) loss: 0.7728 (0.7799) time: 0.2665 data: 0.0001 max mem: 26157 Train: [82] [4800/6250] eta: 0:06:28 lr: 0.000010 grad: 0.2032 (0.3612) loss: 0.7748 (0.7798) time: 0.2665 data: 0.0001 max mem: 26157 Train: [82] [4900/6250] eta: 0:06:01 lr: 0.000010 grad: 0.2083 (0.3641) loss: 0.7856 (0.7798) time: 0.2665 data: 0.0001 max mem: 26157 Train: [82] [5000/6250] eta: 0:05:35 lr: 0.000010 grad: 0.2074 (0.3684) loss: 0.7794 (0.7798) time: 0.2670 data: 0.0001 max mem: 26157 Train: [82] [5100/6250] eta: 0:05:08 lr: 0.000010 grad: 0.2043 (0.3694) loss: 0.7790 (0.7798) time: 0.2665 data: 0.0002 max mem: 26157 Train: [82] [5200/6250] eta: 0:04:41 lr: 0.000010 grad: 0.2129 (0.3691) loss: 0.7766 (0.7797) time: 0.2670 data: 0.0001 max mem: 26157 Train: [82] [5300/6250] eta: 0:04:14 lr: 0.000010 grad: 0.2010 (0.3708) loss: 0.7796 (0.7797) time: 0.2674 data: 0.0001 max mem: 26157 Train: [82] [5400/6250] eta: 0:03:47 lr: 0.000010 grad: 0.2019 (0.3712) loss: 0.7763 (0.7797) time: 0.2667 data: 0.0001 max mem: 26157 Train: [82] [5500/6250] eta: 0:03:20 lr: 0.000010 grad: 0.2040 (0.3725) loss: 0.7838 (0.7797) time: 0.2679 data: 0.0001 max mem: 26157 Train: [82] [5600/6250] eta: 0:02:54 lr: 0.000010 grad: 0.2233 (0.3753) loss: 0.7795 (0.7796) time: 0.2680 data: 0.0001 max mem: 26157 Train: [82] [5700/6250] eta: 0:02:27 lr: 0.000010 grad: 0.2026 (0.3795) loss: 0.7767 (0.7797) time: 0.2667 data: 0.0002 max mem: 26157 Train: [82] [5800/6250] eta: 0:02:00 lr: 0.000010 grad: 0.1896 (0.3795) loss: 0.7897 (0.7798) time: 0.2671 data: 0.0002 max mem: 26157 Train: [82] [5900/6250] eta: 0:01:33 lr: 0.000010 grad: 0.1951 (0.3792) loss: 0.7726 (0.7799) time: 0.2680 data: 0.0001 max mem: 26157 Train: [82] [6000/6250] eta: 0:01:06 lr: 0.000010 grad: 0.1995 (0.3824) loss: 0.7804 (0.7799) time: 0.2667 data: 0.0001 max mem: 26157 Train: [82] [6100/6250] eta: 0:00:40 lr: 0.000010 grad: 0.2009 (0.3838) loss: 0.7825 (0.7799) time: 0.2667 data: 0.0001 max mem: 26157 Train: [82] [6200/6250] eta: 0:00:13 lr: 0.000010 grad: 0.2084 (0.3817) loss: 0.7749 (0.7799) time: 0.2665 data: 0.0001 max mem: 26157 Train: [82] [6249/6250] eta: 0:00:00 lr: 0.000010 grad: 0.1986 (0.3804) loss: 0.7811 (0.7800) time: 0.2669 data: 0.0001 max mem: 26157 Train: [82] Total time: 0:27:57 (0.2684 s / it) Averaged stats: lr: 0.000010 grad: 0.1986 (0.3804) loss: 0.7811 (0.7800) Eval (hcp-train-subset): [82] [ 0/62] eta: 0:03:24 loss: 0.8077 (0.8077) time: 3.2951 data: 3.2024 max mem: 26157 Eval (hcp-train-subset): [82] [61/62] eta: 0:00:00 loss: 0.7924 (0.7955) time: 0.0946 data: 0.0124 max mem: 26157 Eval (hcp-train-subset): [82] Total time: 0:00:10 (0.1643 s / it) Averaged stats (hcp-train-subset): loss: 0.7924 (0.7955) Making plots (hcp-train-subset): example=25 Eval (hcp-val): [82] [ 0/62] eta: 0:04:14 loss: 0.8175 (0.8175) time: 4.1091 data: 4.0270 max mem: 26157 Eval (hcp-val): [82] [61/62] eta: 0:00:00 loss: 0.8226 (0.8235) time: 0.0824 data: 0.0001 max mem: 26157 Eval (hcp-val): [82] Total time: 0:00:10 (0.1631 s / it) Averaged stats (hcp-val): loss: 0.8226 (0.8235) Making plots (hcp-val): example=38 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [83] [ 0/6250] eta: 6:47:59 lr: 0.000010 grad: 0.5127 (0.5127) loss: 0.8106 (0.8106) time: 3.9167 data: 3.5920 max mem: 26157 Train: [83] [ 100/6250] eta: 0:32:00 lr: 0.000010 grad: 0.2721 (0.4467) loss: 0.7994 (0.7963) time: 0.2674 data: 0.0001 max mem: 26157 Train: [83] [ 200/6250] eta: 0:29:12 lr: 0.000010 grad: 0.2110 (0.4739) loss: 0.7882 (0.7895) time: 0.2659 data: 0.0001 max mem: 26157 Train: [83] [ 300/6250] eta: 0:27:58 lr: 0.000010 grad: 0.2374 (0.4408) loss: 0.7802 (0.7863) time: 0.2677 data: 0.0001 max mem: 26157 Train: [83] [ 400/6250] eta: 0:27:09 lr: 0.000010 grad: 0.2450 (0.4961) loss: 0.7778 (0.7840) time: 0.2668 data: 0.0001 max mem: 26157 Train: [83] [ 500/6250] eta: 0:26:28 lr: 0.000010 grad: 0.2317 (0.5609) loss: 0.7803 (0.7822) time: 0.2668 data: 0.0001 max mem: 26157 Train: [83] [ 600/6250] eta: 0:25:52 lr: 0.000010 grad: 0.2644 (0.5677) loss: 0.7803 (0.7816) time: 0.2667 data: 0.0001 max mem: 26157 Train: [83] [ 700/6250] eta: 0:25:18 lr: 0.000009 grad: 0.2221 (0.5869) loss: 0.7755 (0.7811) time: 0.2676 data: 0.0001 max mem: 26157 Train: [83] [ 800/6250] eta: 0:24:47 lr: 0.000009 grad: 0.2129 (0.5759) loss: 0.7772 (0.7810) time: 0.2678 data: 0.0001 max mem: 26157 Train: [83] [ 900/6250] eta: 0:24:16 lr: 0.000009 grad: 0.2027 (0.5566) loss: 0.7825 (0.7808) time: 0.2672 data: 0.0001 max mem: 26157 Train: [83] [1000/6250] eta: 0:23:46 lr: 0.000009 grad: 0.2064 (0.5498) loss: 0.7889 (0.7807) time: 0.2678 data: 0.0002 max mem: 26157 Train: [83] [1100/6250] eta: 0:23:17 lr: 0.000009 grad: 0.2173 (0.5335) loss: 0.7805 (0.7807) time: 0.2671 data: 0.0002 max mem: 26157 Train: [83] [1200/6250] eta: 0:22:48 lr: 0.000009 grad: 0.1935 (0.5155) loss: 0.7866 (0.7808) time: 0.2669 data: 0.0001 max mem: 26157 Train: [83] [1300/6250] eta: 0:22:19 lr: 0.000009 grad: 0.1943 (0.5053) loss: 0.7772 (0.7807) time: 0.2670 data: 0.0001 max mem: 26157 Train: [83] [1400/6250] eta: 0:21:51 lr: 0.000009 grad: 0.1847 (0.4997) loss: 0.7930 (0.7809) time: 0.2671 data: 0.0001 max mem: 26157 Train: [83] [1500/6250] eta: 0:21:23 lr: 0.000009 grad: 0.1968 (0.4994) loss: 0.7771 (0.7809) time: 0.2662 data: 0.0001 max mem: 26157 Train: [83] [1600/6250] eta: 0:20:55 lr: 0.000009 grad: 0.1931 (0.4835) loss: 0.7726 (0.7809) time: 0.2678 data: 0.0002 max mem: 26157 Train: [83] [1700/6250] eta: 0:20:27 lr: 0.000009 grad: 0.2014 (0.4781) loss: 0.7786 (0.7809) time: 0.2664 data: 0.0001 max mem: 26157 Train: [83] [1800/6250] eta: 0:19:59 lr: 0.000009 grad: 0.2014 (0.4711) loss: 0.7795 (0.7808) time: 0.2665 data: 0.0001 max mem: 26157 Train: [83] [1900/6250] eta: 0:19:32 lr: 0.000009 grad: 0.1905 (0.4633) loss: 0.7797 (0.7809) time: 0.2675 data: 0.0002 max mem: 26157 Train: [83] [2000/6250] eta: 0:19:04 lr: 0.000009 grad: 0.1882 (0.4533) loss: 0.7741 (0.7809) time: 0.2674 data: 0.0002 max mem: 26157 Train: [83] [2100/6250] eta: 0:18:37 lr: 0.000009 grad: 0.1863 (0.4508) loss: 0.7874 (0.7810) time: 0.2663 data: 0.0001 max mem: 26157 Train: [83] [2200/6250] eta: 0:18:10 lr: 0.000009 grad: 0.1921 (0.4447) loss: 0.7755 (0.7810) time: 0.2675 data: 0.0002 max mem: 26157 Train: [83] [2300/6250] eta: 0:17:42 lr: 0.000009 grad: 0.1935 (0.4377) loss: 0.7803 (0.7810) time: 0.2679 data: 0.0002 max mem: 26157 Train: [83] [2400/6250] eta: 0:17:15 lr: 0.000009 grad: 0.1935 (0.4323) loss: 0.7751 (0.7811) time: 0.2672 data: 0.0001 max mem: 26157 Train: [83] [2500/6250] eta: 0:16:48 lr: 0.000009 grad: 0.1771 (0.4308) loss: 0.7843 (0.7812) time: 0.2663 data: 0.0001 max mem: 26157 Train: [83] [2600/6250] eta: 0:16:21 lr: 0.000009 grad: 0.1949 (0.4247) loss: 0.7771 (0.7812) time: 0.2664 data: 0.0001 max mem: 26157 Train: [83] [2700/6250] eta: 0:15:54 lr: 0.000009 grad: 0.1873 (0.4226) loss: 0.7877 (0.7812) time: 0.2676 data: 0.0001 max mem: 26157 Train: [83] [2800/6250] eta: 0:15:26 lr: 0.000009 grad: 0.1863 (0.4175) loss: 0.7860 (0.7814) time: 0.2663 data: 0.0001 max mem: 26157 Train: [83] [2900/6250] eta: 0:14:59 lr: 0.000009 grad: 0.1882 (0.4124) loss: 0.7821 (0.7815) time: 0.2671 data: 0.0001 max mem: 26157 Train: [83] [3000/6250] eta: 0:14:32 lr: 0.000009 grad: 0.2060 (0.4118) loss: 0.7788 (0.7816) time: 0.2669 data: 0.0001 max mem: 26157 Train: [83] [3100/6250] eta: 0:14:05 lr: 0.000009 grad: 0.2187 (0.4097) loss: 0.7746 (0.7816) time: 0.2668 data: 0.0001 max mem: 26157 Train: [83] [3200/6250] eta: 0:13:38 lr: 0.000009 grad: 0.1979 (0.4048) loss: 0.7813 (0.7815) time: 0.2666 data: 0.0002 max mem: 26157 Train: [83] [3300/6250] eta: 0:13:11 lr: 0.000009 grad: 0.1962 (0.4093) loss: 0.7882 (0.7815) time: 0.2666 data: 0.0001 max mem: 26157 Train: [83] [3400/6250] eta: 0:12:44 lr: 0.000009 grad: 0.2040 (0.4074) loss: 0.7885 (0.7815) time: 0.2669 data: 0.0001 max mem: 26157 Train: [83] [3500/6250] eta: 0:12:17 lr: 0.000009 grad: 0.2063 (0.4059) loss: 0.7848 (0.7815) time: 0.2672 data: 0.0001 max mem: 26157 Train: [83] [3600/6250] eta: 0:11:51 lr: 0.000009 grad: 0.1987 (0.4079) loss: 0.7779 (0.7814) time: 0.2676 data: 0.0001 max mem: 26157 Train: [83] [3700/6250] eta: 0:11:24 lr: 0.000009 grad: 0.2000 (0.4102) loss: 0.7825 (0.7815) time: 0.2657 data: 0.0001 max mem: 26157 Train: [83] [3800/6250] eta: 0:10:57 lr: 0.000009 grad: 0.1999 (0.4066) loss: 0.7836 (0.7815) time: 0.2669 data: 0.0001 max mem: 26157 Train: [83] [3900/6250] eta: 0:10:30 lr: 0.000009 grad: 0.2105 (0.4039) loss: 0.7768 (0.7814) time: 0.2669 data: 0.0001 max mem: 26157 Train: [83] [4000/6250] eta: 0:10:03 lr: 0.000009 grad: 0.2135 (0.4037) loss: 0.7792 (0.7814) time: 0.2668 data: 0.0001 max mem: 26157 Train: [83] [4100/6250] eta: 0:09:36 lr: 0.000009 grad: 0.2030 (0.4007) loss: 0.7793 (0.7814) time: 0.2686 data: 0.0002 max mem: 26157 Train: [83] [4200/6250] eta: 0:09:09 lr: 0.000009 grad: 0.1916 (0.3983) loss: 0.7848 (0.7814) time: 0.2679 data: 0.0001 max mem: 26157 Train: [83] [4300/6250] eta: 0:08:42 lr: 0.000009 grad: 0.2030 (0.3989) loss: 0.7810 (0.7815) time: 0.2663 data: 0.0001 max mem: 26157 Train: [83] [4400/6250] eta: 0:08:15 lr: 0.000009 grad: 0.1927 (0.4009) loss: 0.7963 (0.7817) time: 0.2680 data: 0.0002 max mem: 26157 Train: [83] [4500/6250] eta: 0:07:49 lr: 0.000009 grad: 0.1952 (0.3979) loss: 0.7805 (0.7818) time: 0.2661 data: 0.0001 max mem: 26157 Train: [83] [4600/6250] eta: 0:07:22 lr: 0.000009 grad: 0.1973 (0.3989) loss: 0.7822 (0.7819) time: 0.2669 data: 0.0001 max mem: 26157 Train: [83] [4700/6250] eta: 0:06:55 lr: 0.000009 grad: 0.2004 (0.3968) loss: 0.7864 (0.7820) time: 0.2669 data: 0.0001 max mem: 26157 Train: [83] [4800/6250] eta: 0:06:28 lr: 0.000009 grad: 0.1938 (0.4002) loss: 0.7824 (0.7821) time: 0.2680 data: 0.0001 max mem: 26157 Train: [83] [4900/6250] eta: 0:06:01 lr: 0.000009 grad: 0.1996 (0.3983) loss: 0.7769 (0.7821) time: 0.2667 data: 0.0001 max mem: 26157 Train: [83] [5000/6250] eta: 0:05:34 lr: 0.000009 grad: 0.2025 (0.3959) loss: 0.7859 (0.7822) time: 0.2676 data: 0.0001 max mem: 26157 Train: [83] [5100/6250] eta: 0:05:08 lr: 0.000009 grad: 0.1956 (0.3960) loss: 0.7824 (0.7822) time: 0.2667 data: 0.0001 max mem: 26157 Train: [83] [5200/6250] eta: 0:04:41 lr: 0.000009 grad: 0.1931 (0.3931) loss: 0.7897 (0.7822) time: 0.2665 data: 0.0001 max mem: 26157 Train: [83] [5300/6250] eta: 0:04:14 lr: 0.000009 grad: 0.1997 (0.3917) loss: 0.7770 (0.7823) time: 0.2665 data: 0.0001 max mem: 26157 Train: [83] [5400/6250] eta: 0:03:47 lr: 0.000009 grad: 0.2009 (0.3917) loss: 0.7781 (0.7823) time: 0.2671 data: 0.0001 max mem: 26157 Train: [83] [5500/6250] eta: 0:03:20 lr: 0.000009 grad: 0.1958 (0.3909) loss: 0.7742 (0.7824) time: 0.2674 data: 0.0001 max mem: 26157 Train: [83] [5600/6250] eta: 0:02:54 lr: 0.000009 grad: 0.1978 (0.3905) loss: 0.7859 (0.7824) time: 0.2665 data: 0.0001 max mem: 26157 Train: [83] [5700/6250] eta: 0:02:27 lr: 0.000009 grad: 0.2064 (0.3928) loss: 0.7752 (0.7824) time: 0.2668 data: 0.0001 max mem: 26157 Train: [83] [5800/6250] eta: 0:02:00 lr: 0.000009 grad: 0.2177 (0.3942) loss: 0.7750 (0.7824) time: 0.2664 data: 0.0001 max mem: 26157 Train: [83] [5900/6250] eta: 0:01:33 lr: 0.000009 grad: 0.1834 (0.3935) loss: 0.7849 (0.7824) time: 0.2687 data: 0.0002 max mem: 26157 Train: [83] [6000/6250] eta: 0:01:06 lr: 0.000009 grad: 0.2142 (0.3942) loss: 0.7805 (0.7823) time: 0.2673 data: 0.0001 max mem: 26157 Train: [83] [6100/6250] eta: 0:00:40 lr: 0.000009 grad: 0.2172 (0.3956) loss: 0.7787 (0.7823) time: 0.2671 data: 0.0002 max mem: 26157 Train: [83] [6200/6250] eta: 0:00:13 lr: 0.000009 grad: 0.2478 (0.3957) loss: 0.7846 (0.7823) time: 0.2670 data: 0.0001 max mem: 26157 Train: [83] [6249/6250] eta: 0:00:00 lr: 0.000009 grad: 0.2068 (0.3953) loss: 0.7829 (0.7823) time: 0.2682 data: 0.0002 max mem: 26157 Train: [83] Total time: 0:27:56 (0.2683 s / it) Averaged stats: lr: 0.000009 grad: 0.2068 (0.3953) loss: 0.7829 (0.7823) Eval (hcp-train-subset): [83] [ 0/62] eta: 0:04:05 loss: 0.8027 (0.8027) time: 3.9616 data: 3.8536 max mem: 26157 Eval (hcp-train-subset): [83] [61/62] eta: 0:00:00 loss: 0.7917 (0.7953) time: 0.1035 data: 0.0193 max mem: 26157 Eval (hcp-train-subset): [83] Total time: 0:00:11 (0.1864 s / it) Averaged stats (hcp-train-subset): loss: 0.7917 (0.7953) Making plots (hcp-train-subset): example=60 Eval (hcp-val): [83] [ 0/62] eta: 0:04:18 loss: 0.8219 (0.8219) time: 4.1754 data: 4.0922 max mem: 26157 Eval (hcp-val): [83] [61/62] eta: 0:00:00 loss: 0.8229 (0.8234) time: 0.0945 data: 0.0104 max mem: 26157 Eval (hcp-val): [83] Total time: 0:00:10 (0.1764 s / it) Averaged stats (hcp-val): loss: 0.8229 (0.8234) Making plots (hcp-val): example=61 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [84] [ 0/6250] eta: 6:21:02 lr: 0.000009 grad: 0.2163 (0.2163) loss: 0.8301 (0.8301) time: 3.6579 data: 3.3164 max mem: 26157 Train: [84] [ 100/6250] eta: 0:31:12 lr: 0.000009 grad: 0.2311 (0.3741) loss: 0.7980 (0.7987) time: 0.2682 data: 0.0002 max mem: 26157 Train: [84] [ 200/6250] eta: 0:28:48 lr: 0.000009 grad: 0.2152 (0.3536) loss: 0.7914 (0.7960) time: 0.2660 data: 0.0001 max mem: 26157 Train: [84] [ 300/6250] eta: 0:27:44 lr: 0.000008 grad: 0.2042 (0.3359) loss: 0.7872 (0.7929) time: 0.2676 data: 0.0001 max mem: 26157 Train: [84] [ 400/6250] eta: 0:26:58 lr: 0.000008 grad: 0.2228 (0.3226) loss: 0.7827 (0.7900) time: 0.2689 data: 0.0002 max mem: 26157 Train: [84] [ 500/6250] eta: 0:26:19 lr: 0.000008 grad: 0.2086 (0.3301) loss: 0.7799 (0.7887) time: 0.2667 data: 0.0001 max mem: 26157 Train: [84] [ 600/6250] eta: 0:25:45 lr: 0.000008 grad: 0.2047 (0.3213) loss: 0.7823 (0.7879) time: 0.2672 data: 0.0001 max mem: 26157 Train: [84] [ 700/6250] eta: 0:25:12 lr: 0.000008 grad: 0.2045 (0.3224) loss: 0.7860 (0.7869) time: 0.2672 data: 0.0001 max mem: 26157 Train: [84] [ 800/6250] eta: 0:24:41 lr: 0.000008 grad: 0.1934 (0.3224) loss: 0.7795 (0.7860) time: 0.2672 data: 0.0001 max mem: 26157 Train: [84] [ 900/6250] eta: 0:24:11 lr: 0.000008 grad: 0.1974 (0.3173) loss: 0.7927 (0.7858) time: 0.2684 data: 0.0002 max mem: 26157 Train: [84] [1000/6250] eta: 0:23:42 lr: 0.000008 grad: 0.1907 (0.3298) loss: 0.7812 (0.7856) time: 0.2669 data: 0.0001 max mem: 26157 Train: [84] [1100/6250] eta: 0:23:13 lr: 0.000008 grad: 0.1985 (0.3281) loss: 0.7866 (0.7854) time: 0.2672 data: 0.0001 max mem: 26157 Train: [84] [1200/6250] eta: 0:22:45 lr: 0.000008 grad: 0.2066 (0.3302) loss: 0.7834 (0.7852) time: 0.2672 data: 0.0001 max mem: 26157 Train: [84] [1300/6250] eta: 0:22:16 lr: 0.000008 grad: 0.1979 (0.3218) loss: 0.7786 (0.7850) time: 0.2674 data: 0.0002 max mem: 26157 Train: [84] [1400/6250] eta: 0:21:48 lr: 0.000008 grad: 0.1780 (0.3200) loss: 0.7836 (0.7847) time: 0.2664 data: 0.0001 max mem: 26157 Train: [84] [1500/6250] eta: 0:21:20 lr: 0.000008 grad: 0.2043 (0.3203) loss: 0.7805 (0.7844) time: 0.2669 data: 0.0002 max mem: 26157 Train: [84] [1600/6250] eta: 0:20:52 lr: 0.000008 grad: 0.1990 (0.3147) loss: 0.7840 (0.7843) time: 0.2666 data: 0.0001 max mem: 26157 Train: [84] [1700/6250] eta: 0:20:25 lr: 0.000008 grad: 0.1908 (0.3190) loss: 0.7823 (0.7839) time: 0.2664 data: 0.0001 max mem: 26157 Train: [84] [1800/6250] eta: 0:19:57 lr: 0.000008 grad: 0.1913 (0.3151) loss: 0.7835 (0.7837) time: 0.2665 data: 0.0001 max mem: 26157 Train: [84] [1900/6250] eta: 0:19:30 lr: 0.000008 grad: 0.1985 (0.3257) loss: 0.7712 (0.7835) time: 0.2670 data: 0.0001 max mem: 26157 Train: [84] [2000/6250] eta: 0:19:03 lr: 0.000008 grad: 0.1954 (0.3266) loss: 0.7847 (0.7832) time: 0.2663 data: 0.0001 max mem: 26157 Train: [84] [2100/6250] eta: 0:18:35 lr: 0.000008 grad: 0.1939 (0.3242) loss: 0.7866 (0.7832) time: 0.2674 data: 0.0001 max mem: 26157 Train: [84] [2200/6250] eta: 0:18:08 lr: 0.000008 grad: 0.2041 (0.3270) loss: 0.7794 (0.7830) time: 0.2685 data: 0.0001 max mem: 26157 Train: [84] [2300/6250] eta: 0:17:41 lr: 0.000008 grad: 0.1903 (0.3238) loss: 0.7789 (0.7830) time: 0.2670 data: 0.0001 max mem: 26157 Train: [84] [2400/6250] eta: 0:17:14 lr: 0.000008 grad: 0.2063 (0.3260) loss: 0.7820 (0.7830) time: 0.2663 data: 0.0001 max mem: 26157 Train: [84] [2500/6250] eta: 0:16:47 lr: 0.000008 grad: 0.1860 (0.3242) loss: 0.7843 (0.7831) time: 0.2672 data: 0.0001 max mem: 26157 Train: [84] [2600/6250] eta: 0:16:20 lr: 0.000008 grad: 0.1926 (0.3242) loss: 0.7723 (0.7831) time: 0.2685 data: 0.0002 max mem: 26157 Train: [84] [2700/6250] eta: 0:15:53 lr: 0.000008 grad: 0.1946 (0.3266) loss: 0.7840 (0.7830) time: 0.2661 data: 0.0001 max mem: 26157 Train: [84] [2800/6250] eta: 0:15:26 lr: 0.000008 grad: 0.1878 (0.3261) loss: 0.7855 (0.7829) time: 0.2664 data: 0.0001 max mem: 26157 Train: [84] [2900/6250] eta: 0:14:59 lr: 0.000008 grad: 0.1850 (0.3273) loss: 0.7843 (0.7829) time: 0.2670 data: 0.0001 max mem: 26157 Train: [84] [3000/6250] eta: 0:14:31 lr: 0.000008 grad: 0.2030 (0.3279) loss: 0.7805 (0.7829) time: 0.2669 data: 0.0001 max mem: 26157 Train: [84] [3100/6250] eta: 0:14:05 lr: 0.000008 grad: 0.1963 (0.3278) loss: 0.7839 (0.7828) time: 0.2666 data: 0.0001 max mem: 26157 Train: [84] [3200/6250] eta: 0:13:38 lr: 0.000008 grad: 0.2055 (0.3325) loss: 0.7830 (0.7828) time: 0.2668 data: 0.0002 max mem: 26157 Train: [84] [3300/6250] eta: 0:13:11 lr: 0.000008 grad: 0.2103 (0.3306) loss: 0.7813 (0.7827) time: 0.2681 data: 0.0001 max mem: 26157 Train: [84] [3400/6250] eta: 0:12:44 lr: 0.000008 grad: 0.2036 (0.3287) loss: 0.7782 (0.7829) time: 0.2671 data: 0.0001 max mem: 26157 Train: [84] [3500/6250] eta: 0:12:17 lr: 0.000008 grad: 0.1969 (0.3293) loss: 0.7846 (0.7829) time: 0.2664 data: 0.0001 max mem: 26157 Train: [84] [3600/6250] eta: 0:11:50 lr: 0.000008 grad: 0.2013 (0.3272) loss: 0.7871 (0.7830) time: 0.2671 data: 0.0001 max mem: 26157 Train: [84] [3700/6250] eta: 0:11:23 lr: 0.000008 grad: 0.1897 (0.3274) loss: 0.7860 (0.7830) time: 0.2670 data: 0.0001 max mem: 26157 Train: [84] [3800/6250] eta: 0:10:56 lr: 0.000008 grad: 0.1923 (0.3263) loss: 0.7846 (0.7831) time: 0.2672 data: 0.0002 max mem: 26157 Train: [84] [3900/6250] eta: 0:10:29 lr: 0.000008 grad: 0.1991 (0.3266) loss: 0.7859 (0.7831) time: 0.2680 data: 0.0001 max mem: 26157 Train: [84] [4000/6250] eta: 0:10:02 lr: 0.000008 grad: 0.2107 (0.3275) loss: 0.7836 (0.7832) time: 0.2675 data: 0.0001 max mem: 26157 Train: [84] [4100/6250] eta: 0:09:36 lr: 0.000008 grad: 0.2058 (0.3274) loss: 0.7889 (0.7832) time: 0.2670 data: 0.0001 max mem: 26157 Train: [84] [4200/6250] eta: 0:09:09 lr: 0.000008 grad: 0.1972 (0.3300) loss: 0.7857 (0.7831) time: 0.2668 data: 0.0002 max mem: 26157 Train: [84] [4300/6250] eta: 0:08:42 lr: 0.000008 grad: 0.1926 (0.3331) loss: 0.7840 (0.7832) time: 0.2670 data: 0.0001 max mem: 26157 Train: [84] [4400/6250] eta: 0:08:15 lr: 0.000008 grad: 0.1979 (0.3331) loss: 0.7897 (0.7833) time: 0.2663 data: 0.0001 max mem: 26157 Train: [84] [4500/6250] eta: 0:07:48 lr: 0.000008 grad: 0.2032 (0.3314) loss: 0.7869 (0.7833) time: 0.2663 data: 0.0001 max mem: 26157 Train: [84] [4600/6250] eta: 0:07:21 lr: 0.000008 grad: 0.2020 (0.3327) loss: 0.7797 (0.7833) time: 0.2669 data: 0.0002 max mem: 26157 Train: [84] [4700/6250] eta: 0:06:55 lr: 0.000008 grad: 0.2075 (0.3341) loss: 0.7819 (0.7832) time: 0.2678 data: 0.0001 max mem: 26157 Train: [84] [4800/6250] eta: 0:06:28 lr: 0.000008 grad: 0.2220 (0.3371) loss: 0.7746 (0.7830) time: 0.2669 data: 0.0001 max mem: 26157 Train: [84] [4900/6250] eta: 0:06:01 lr: 0.000008 grad: 0.2100 (0.3400) loss: 0.7754 (0.7829) time: 0.2677 data: 0.0001 max mem: 26157 Train: [84] [5000/6250] eta: 0:05:34 lr: 0.000008 grad: 0.2097 (0.3411) loss: 0.7702 (0.7828) time: 0.2670 data: 0.0001 max mem: 26157 Train: [84] [5100/6250] eta: 0:05:07 lr: 0.000008 grad: 0.2189 (0.3429) loss: 0.7703 (0.7827) time: 0.2671 data: 0.0002 max mem: 26157 Train: [84] [5200/6250] eta: 0:04:41 lr: 0.000008 grad: 0.2251 (0.3414) loss: 0.7839 (0.7826) time: 0.2667 data: 0.0001 max mem: 26157 Train: [84] [5300/6250] eta: 0:04:14 lr: 0.000008 grad: 0.2086 (0.3409) loss: 0.7821 (0.7825) time: 0.2678 data: 0.0001 max mem: 26157 Train: [84] [5400/6250] eta: 0:03:47 lr: 0.000008 grad: 0.2177 (0.3411) loss: 0.7770 (0.7825) time: 0.2672 data: 0.0001 max mem: 26157 Train: [84] [5500/6250] eta: 0:03:20 lr: 0.000008 grad: 0.2089 (0.3426) loss: 0.7775 (0.7825) time: 0.2672 data: 0.0001 max mem: 26157 Train: [84] [5600/6250] eta: 0:02:54 lr: 0.000008 grad: 0.2315 (0.3452) loss: 0.7826 (0.7824) time: 0.2669 data: 0.0001 max mem: 26157 Train: [84] [5700/6250] eta: 0:02:27 lr: 0.000008 grad: 0.1960 (0.3437) loss: 0.7857 (0.7824) time: 0.2662 data: 0.0001 max mem: 26157 Train: [84] [5800/6250] eta: 0:02:00 lr: 0.000008 grad: 0.2074 (0.3435) loss: 0.7798 (0.7824) time: 0.2665 data: 0.0001 max mem: 26157 Train: [84] [5900/6250] eta: 0:01:33 lr: 0.000008 grad: 0.2014 (0.3429) loss: 0.7744 (0.7824) time: 0.2662 data: 0.0001 max mem: 26157 Train: [84] [6000/6250] eta: 0:01:06 lr: 0.000008 grad: 0.2098 (0.3436) loss: 0.7679 (0.7824) time: 0.2670 data: 0.0001 max mem: 26157 Train: [84] [6100/6250] eta: 0:00:40 lr: 0.000008 grad: 0.2105 (0.3456) loss: 0.7832 (0.7823) time: 0.2669 data: 0.0001 max mem: 26157 Train: [84] [6200/6250] eta: 0:00:13 lr: 0.000008 grad: 0.2119 (0.3460) loss: 0.7821 (0.7823) time: 0.2670 data: 0.0001 max mem: 26157 Train: [84] [6249/6250] eta: 0:00:00 lr: 0.000008 grad: 0.2009 (0.3451) loss: 0.7861 (0.7823) time: 0.2671 data: 0.0001 max mem: 26157 Train: [84] Total time: 0:27:55 (0.2681 s / it) Averaged stats: lr: 0.000008 grad: 0.2009 (0.3451) loss: 0.7861 (0.7823) Eval (hcp-train-subset): [84] [ 0/62] eta: 0:03:17 loss: 0.8040 (0.8040) time: 3.1897 data: 3.0815 max mem: 26157 Eval (hcp-train-subset): [84] [61/62] eta: 0:00:00 loss: 0.7893 (0.7938) time: 0.0844 data: 0.0001 max mem: 26157 Eval (hcp-train-subset): [84] Total time: 0:00:10 (0.1695 s / it) Averaged stats (hcp-train-subset): loss: 0.7893 (0.7938) Making plots (hcp-train-subset): example=56 Eval (hcp-val): [84] [ 0/62] eta: 0:04:40 loss: 0.8131 (0.8131) time: 4.5228 data: 4.4400 max mem: 26157 Eval (hcp-val): [84] [61/62] eta: 0:00:00 loss: 0.8215 (0.8232) time: 0.0820 data: 0.0001 max mem: 26157 Eval (hcp-val): [84] Total time: 0:00:10 (0.1697 s / it) Averaged stats (hcp-val): loss: 0.8215 (0.8232) Making plots (hcp-val): example=2 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [85] [ 0/6250] eta: 5:53:53 lr: 0.000008 grad: 0.2158 (0.2158) loss: 0.8463 (0.8463) time: 3.3974 data: 3.0715 max mem: 26157 Train: [85] [ 100/6250] eta: 0:32:20 lr: 0.000008 grad: 0.2053 (0.4266) loss: 0.7835 (0.7896) time: 0.2677 data: 0.0001 max mem: 26157 Train: [85] [ 200/6250] eta: 0:29:22 lr: 0.000008 grad: 0.2322 (0.3920) loss: 0.7797 (0.7861) time: 0.2666 data: 0.0001 max mem: 26157 Train: [85] [ 300/6250] eta: 0:28:04 lr: 0.000007 grad: 0.2496 (0.4125) loss: 0.7863 (0.7853) time: 0.2662 data: 0.0001 max mem: 26157 Train: [85] [ 400/6250] eta: 0:27:12 lr: 0.000007 grad: 0.2021 (0.3939) loss: 0.7883 (0.7850) time: 0.2676 data: 0.0001 max mem: 26157 Train: [85] [ 500/6250] eta: 0:26:31 lr: 0.000007 grad: 0.2162 (0.4093) loss: 0.7729 (0.7851) time: 0.2675 data: 0.0001 max mem: 26157 Train: [85] [ 600/6250] eta: 0:25:55 lr: 0.000007 grad: 0.2035 (0.4087) loss: 0.7823 (0.7853) time: 0.2667 data: 0.0001 max mem: 26157 Train: [85] [ 700/6250] eta: 0:25:20 lr: 0.000007 grad: 0.2020 (0.4104) loss: 0.7811 (0.7853) time: 0.2667 data: 0.0001 max mem: 26157 Train: [85] [ 800/6250] eta: 0:24:49 lr: 0.000007 grad: 0.1899 (0.4022) loss: 0.7730 (0.7849) time: 0.2672 data: 0.0001 max mem: 26157 Train: [85] [ 900/6250] eta: 0:24:19 lr: 0.000007 grad: 0.2093 (0.3853) loss: 0.7800 (0.7844) time: 0.2676 data: 0.0001 max mem: 26157 Train: [85] [1000/6250] eta: 0:23:48 lr: 0.000007 grad: 0.2115 (0.3909) loss: 0.7811 (0.7842) time: 0.2664 data: 0.0001 max mem: 26157 Train: [85] [1100/6250] eta: 0:23:18 lr: 0.000007 grad: 0.1915 (0.3858) loss: 0.7843 (0.7840) time: 0.2666 data: 0.0001 max mem: 26157 Train: [85] [1200/6250] eta: 0:22:49 lr: 0.000007 grad: 0.2067 (0.3740) loss: 0.7780 (0.7836) time: 0.2666 data: 0.0001 max mem: 26157 Train: [85] [1300/6250] eta: 0:22:21 lr: 0.000007 grad: 0.2191 (0.3791) loss: 0.7746 (0.7834) time: 0.2671 data: 0.0001 max mem: 26157 Train: [85] [1400/6250] eta: 0:21:52 lr: 0.000007 grad: 0.2411 (0.3720) loss: 0.7795 (0.7831) time: 0.2666 data: 0.0001 max mem: 26157 Train: [85] [1500/6250] eta: 0:21:24 lr: 0.000007 grad: 0.2021 (0.3670) loss: 0.7884 (0.7828) time: 0.2676 data: 0.0001 max mem: 26157 Train: [85] [1600/6250] eta: 0:20:56 lr: 0.000007 grad: 0.2005 (0.3645) loss: 0.7794 (0.7828) time: 0.2664 data: 0.0001 max mem: 26157 Train: [85] [1700/6250] eta: 0:20:28 lr: 0.000007 grad: 0.2005 (0.3651) loss: 0.7781 (0.7827) time: 0.2683 data: 0.0002 max mem: 26157 Train: [85] [1800/6250] eta: 0:20:01 lr: 0.000007 grad: 0.2066 (0.3647) loss: 0.7842 (0.7827) time: 0.2669 data: 0.0001 max mem: 26157 Train: [85] [1900/6250] eta: 0:19:33 lr: 0.000007 grad: 0.2086 (0.3695) loss: 0.7829 (0.7826) time: 0.2668 data: 0.0001 max mem: 26157 Train: [85] [2000/6250] eta: 0:19:05 lr: 0.000007 grad: 0.1985 (0.3687) loss: 0.7728 (0.7824) time: 0.2672 data: 0.0001 max mem: 26157 Train: [85] [2100/6250] eta: 0:18:38 lr: 0.000007 grad: 0.1975 (0.3658) loss: 0.7782 (0.7823) time: 0.2674 data: 0.0001 max mem: 26157 Train: [85] [2200/6250] eta: 0:18:10 lr: 0.000007 grad: 0.1974 (0.3631) loss: 0.7812 (0.7823) time: 0.2665 data: 0.0001 max mem: 26157 Train: [85] [2300/6250] eta: 0:17:43 lr: 0.000007 grad: 0.1967 (0.3613) loss: 0.7855 (0.7823) time: 0.2666 data: 0.0001 max mem: 26157 Train: [85] [2400/6250] eta: 0:17:16 lr: 0.000007 grad: 0.2018 (0.3590) loss: 0.7834 (0.7823) time: 0.2669 data: 0.0002 max mem: 26157 Train: [85] [2500/6250] eta: 0:16:48 lr: 0.000007 grad: 0.2131 (0.3632) loss: 0.7768 (0.7821) time: 0.2665 data: 0.0001 max mem: 26157 Train: [85] [2600/6250] eta: 0:16:21 lr: 0.000007 grad: 0.1950 (0.3610) loss: 0.7703 (0.7820) time: 0.2667 data: 0.0001 max mem: 26157 Train: [85] [2700/6250] eta: 0:15:54 lr: 0.000007 grad: 0.2061 (0.3590) loss: 0.7746 (0.7818) time: 0.2664 data: 0.0001 max mem: 26157 Train: [85] [2800/6250] eta: 0:15:27 lr: 0.000007 grad: 0.2152 (0.3588) loss: 0.7811 (0.7818) time: 0.2672 data: 0.0001 max mem: 26157 Train: [85] [2900/6250] eta: 0:15:00 lr: 0.000007 grad: 0.2111 (0.3633) loss: 0.7777 (0.7817) time: 0.2666 data: 0.0001 max mem: 26157 Train: [85] [3000/6250] eta: 0:14:33 lr: 0.000007 grad: 0.2056 (0.3639) loss: 0.7782 (0.7816) time: 0.2667 data: 0.0001 max mem: 26157 Train: [85] [3100/6250] eta: 0:14:06 lr: 0.000007 grad: 0.2011 (0.3651) loss: 0.7810 (0.7815) time: 0.2674 data: 0.0001 max mem: 26157 Train: [85] [3200/6250] eta: 0:13:39 lr: 0.000007 grad: 0.2015 (0.3627) loss: 0.7738 (0.7814) time: 0.2662 data: 0.0001 max mem: 26157 Train: [85] [3300/6250] eta: 0:13:12 lr: 0.000007 grad: 0.2057 (0.3618) loss: 0.7803 (0.7812) time: 0.2687 data: 0.0001 max mem: 26157 Train: [85] [3400/6250] eta: 0:12:45 lr: 0.000007 grad: 0.1961 (0.3607) loss: 0.7812 (0.7811) time: 0.2668 data: 0.0001 max mem: 26157 Train: [85] [3500/6250] eta: 0:12:18 lr: 0.000007 grad: 0.1911 (0.3578) loss: 0.7818 (0.7811) time: 0.2667 data: 0.0002 max mem: 26157 Train: [85] [3600/6250] eta: 0:11:51 lr: 0.000007 grad: 0.2135 (0.3547) loss: 0.7803 (0.7811) time: 0.2678 data: 0.0002 max mem: 26157 Train: [85] [3700/6250] eta: 0:11:24 lr: 0.000007 grad: 0.2192 (0.3558) loss: 0.7820 (0.7812) time: 0.2671 data: 0.0001 max mem: 26157 Train: [85] [3800/6250] eta: 0:10:57 lr: 0.000007 grad: 0.2175 (0.3557) loss: 0.7752 (0.7812) time: 0.2667 data: 0.0001 max mem: 26157 Train: [85] [3900/6250] eta: 0:10:30 lr: 0.000007 grad: 0.1916 (0.3582) loss: 0.7771 (0.7813) time: 0.2682 data: 0.0002 max mem: 26157 Train: [85] [4000/6250] eta: 0:10:03 lr: 0.000007 grad: 0.2054 (0.3609) loss: 0.7787 (0.7812) time: 0.2666 data: 0.0001 max mem: 26157 Train: [85] [4100/6250] eta: 0:09:37 lr: 0.000007 grad: 0.2070 (0.3598) loss: 0.7820 (0.7812) time: 0.2669 data: 0.0001 max mem: 26157 Train: [85] [4200/6250] eta: 0:09:10 lr: 0.000007 grad: 0.2131 (0.3575) loss: 0.7780 (0.7812) time: 0.2670 data: 0.0001 max mem: 26157 Train: [85] [4300/6250] eta: 0:08:43 lr: 0.000007 grad: 0.2127 (0.3561) loss: 0.7851 (0.7812) time: 0.2665 data: 0.0001 max mem: 26157 Train: [85] [4400/6250] eta: 0:08:16 lr: 0.000007 grad: 0.1989 (0.3563) loss: 0.7830 (0.7812) time: 0.2666 data: 0.0002 max mem: 26157 Train: [85] [4500/6250] eta: 0:07:49 lr: 0.000007 grad: 0.1935 (0.3556) loss: 0.7896 (0.7813) time: 0.2676 data: 0.0002 max mem: 26157 Train: [85] [4600/6250] eta: 0:07:22 lr: 0.000007 grad: 0.1941 (0.3539) loss: 0.7810 (0.7813) time: 0.2662 data: 0.0001 max mem: 26157 Train: [85] [4700/6250] eta: 0:06:55 lr: 0.000007 grad: 0.1933 (0.3547) loss: 0.7814 (0.7812) time: 0.2662 data: 0.0001 max mem: 26157 Train: [85] [4800/6250] eta: 0:06:28 lr: 0.000007 grad: 0.2095 (0.3557) loss: 0.7772 (0.7812) time: 0.2668 data: 0.0001 max mem: 26157 Train: [85] [4900/6250] eta: 0:06:02 lr: 0.000007 grad: 0.2046 (0.3555) loss: 0.7858 (0.7813) time: 0.2667 data: 0.0001 max mem: 26157 Train: [85] [5000/6250] eta: 0:05:35 lr: 0.000007 grad: 0.2032 (0.3576) loss: 0.7831 (0.7813) time: 0.2672 data: 0.0001 max mem: 26157 Train: [85] [5100/6250] eta: 0:05:08 lr: 0.000007 grad: 0.2027 (0.3614) loss: 0.7897 (0.7814) time: 0.2670 data: 0.0001 max mem: 26157 Train: [85] [5200/6250] eta: 0:04:41 lr: 0.000007 grad: 0.1989 (0.3620) loss: 0.7866 (0.7814) time: 0.2671 data: 0.0001 max mem: 26157 Train: [85] [5300/6250] eta: 0:04:14 lr: 0.000007 grad: 0.2019 (0.3645) loss: 0.7822 (0.7814) time: 0.2665 data: 0.0001 max mem: 26157 Train: [85] [5400/6250] eta: 0:03:47 lr: 0.000007 grad: 0.2082 (0.3643) loss: 0.7867 (0.7814) time: 0.2669 data: 0.0001 max mem: 26157 Train: [85] [5500/6250] eta: 0:03:21 lr: 0.000007 grad: 0.2040 (0.3646) loss: 0.7890 (0.7815) time: 0.2662 data: 0.0001 max mem: 26157 Train: [85] [5600/6250] eta: 0:02:54 lr: 0.000007 grad: 0.2122 (0.3630) loss: 0.7761 (0.7815) time: 0.2666 data: 0.0001 max mem: 26157 Train: [85] [5700/6250] eta: 0:02:27 lr: 0.000007 grad: 0.2006 (0.3613) loss: 0.7822 (0.7816) time: 0.2673 data: 0.0002 max mem: 26157 Train: [85] [5800/6250] eta: 0:02:00 lr: 0.000007 grad: 0.1956 (0.3666) loss: 0.7814 (0.7816) time: 0.2672 data: 0.0002 max mem: 26157 Train: [85] [5900/6250] eta: 0:01:33 lr: 0.000007 grad: 0.1992 (0.3652) loss: 0.7798 (0.7817) time: 0.2666 data: 0.0001 max mem: 26157 Train: [85] [6000/6250] eta: 0:01:07 lr: 0.000007 grad: 0.1971 (0.3642) loss: 0.7852 (0.7818) time: 0.2666 data: 0.0002 max mem: 26157 Train: [85] [6100/6250] eta: 0:00:40 lr: 0.000007 grad: 0.1952 (0.3626) loss: 0.7856 (0.7818) time: 0.2667 data: 0.0001 max mem: 26157 Train: [85] [6200/6250] eta: 0:00:13 lr: 0.000007 grad: 0.1942 (0.3615) loss: 0.7831 (0.7818) time: 0.2673 data: 0.0001 max mem: 26157 Train: [85] [6249/6250] eta: 0:00:00 lr: 0.000007 grad: 0.1940 (0.3632) loss: 0.7879 (0.7818) time: 0.2698 data: 0.0002 max mem: 26157 Train: [85] Total time: 0:27:58 (0.2685 s / it) Averaged stats: lr: 0.000007 grad: 0.1940 (0.3632) loss: 0.7879 (0.7818) Eval (hcp-train-subset): [85] [ 0/62] eta: 0:03:11 loss: 0.8007 (0.8007) time: 3.0867 data: 2.9799 max mem: 26157 Eval (hcp-train-subset): [85] [61/62] eta: 0:00:00 loss: 0.7914 (0.7948) time: 0.0995 data: 0.0151 max mem: 26157 Eval (hcp-train-subset): [85] Total time: 0:00:10 (0.1716 s / it) Averaged stats (hcp-train-subset): loss: 0.7914 (0.7948) Making plots (hcp-train-subset): example=46 Eval (hcp-val): [85] [ 0/62] eta: 0:03:21 loss: 0.8183 (0.8183) time: 3.2504 data: 3.1478 max mem: 26157 Eval (hcp-val): [85] [61/62] eta: 0:00:00 loss: 0.8235 (0.8240) time: 0.0878 data: 0.0051 max mem: 26157 Eval (hcp-val): [85] Total time: 0:00:11 (0.1782 s / it) Averaged stats (hcp-val): loss: 0.8235 (0.8240) Making plots (hcp-val): example=8 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [86] [ 0/6250] eta: 9:04:10 lr: 0.000007 grad: 0.2156 (0.2156) loss: 0.7899 (0.7899) time: 5.2240 data: 4.9463 max mem: 26157 Train: [86] [ 100/6250] eta: 0:32:29 lr: 0.000007 grad: 0.2885 (0.2881) loss: 0.7930 (0.8023) time: 0.2677 data: 0.0001 max mem: 26157 Train: [86] [ 200/6250] eta: 0:29:26 lr: 0.000007 grad: 0.2052 (0.3529) loss: 0.7981 (0.7951) time: 0.2658 data: 0.0001 max mem: 26157 Train: [86] [ 300/6250] eta: 0:28:07 lr: 0.000007 grad: 0.2289 (0.3668) loss: 0.7812 (0.7921) time: 0.2662 data: 0.0001 max mem: 26157 Train: [86] [ 400/6250] eta: 0:27:15 lr: 0.000007 grad: 0.2289 (0.3875) loss: 0.7826 (0.7900) time: 0.2668 data: 0.0001 max mem: 26157 Train: [86] [ 500/6250] eta: 0:26:33 lr: 0.000007 grad: 0.2404 (0.3759) loss: 0.7819 (0.7887) time: 0.2669 data: 0.0001 max mem: 26157 Train: [86] [ 600/6250] eta: 0:25:56 lr: 0.000006 grad: 0.2165 (0.3819) loss: 0.7891 (0.7879) time: 0.2666 data: 0.0001 max mem: 26157 Train: [86] [ 700/6250] eta: 0:25:21 lr: 0.000006 grad: 0.2075 (0.4107) loss: 0.7833 (0.7879) time: 0.2669 data: 0.0001 max mem: 26157 Train: [86] [ 800/6250] eta: 0:24:49 lr: 0.000006 grad: 0.1980 (0.4160) loss: 0.7860 (0.7880) time: 0.2664 data: 0.0001 max mem: 26157 Train: [86] [ 900/6250] eta: 0:24:18 lr: 0.000006 grad: 0.2285 (0.4092) loss: 0.7813 (0.7879) time: 0.2680 data: 0.0001 max mem: 26157 Train: [86] [1000/6250] eta: 0:23:48 lr: 0.000006 grad: 0.2183 (0.3999) loss: 0.7827 (0.7877) time: 0.2664 data: 0.0001 max mem: 26157 Train: [86] [1100/6250] eta: 0:23:18 lr: 0.000006 grad: 0.2110 (0.3830) loss: 0.7855 (0.7874) time: 0.2670 data: 0.0001 max mem: 26157 Train: [86] [1200/6250] eta: 0:22:49 lr: 0.000006 grad: 0.2254 (0.3831) loss: 0.7833 (0.7872) time: 0.2662 data: 0.0001 max mem: 26157 Train: [86] [1300/6250] eta: 0:22:20 lr: 0.000006 grad: 0.2014 (0.3727) loss: 0.7844 (0.7869) time: 0.2667 data: 0.0001 max mem: 26157 Train: [86] [1400/6250] eta: 0:21:52 lr: 0.000006 grad: 0.2033 (0.3754) loss: 0.7856 (0.7867) time: 0.2669 data: 0.0001 max mem: 26157 Train: [86] [1500/6250] eta: 0:21:24 lr: 0.000006 grad: 0.2142 (0.3682) loss: 0.7825 (0.7867) time: 0.2672 data: 0.0001 max mem: 26157 Train: [86] [1600/6250] eta: 0:20:56 lr: 0.000006 grad: 0.2064 (0.3635) loss: 0.7835 (0.7866) time: 0.2666 data: 0.0001 max mem: 26157 Train: [86] [1700/6250] eta: 0:20:28 lr: 0.000006 grad: 0.1970 (0.3558) loss: 0.7877 (0.7865) time: 0.2665 data: 0.0001 max mem: 26157 Train: [86] [1800/6250] eta: 0:20:00 lr: 0.000006 grad: 0.1949 (0.3483) loss: 0.7789 (0.7864) time: 0.2665 data: 0.0001 max mem: 26157 Train: [86] [1900/6250] eta: 0:19:32 lr: 0.000006 grad: 0.1981 (0.3473) loss: 0.7817 (0.7863) time: 0.2672 data: 0.0002 max mem: 26157 Train: [86] [2000/6250] eta: 0:19:05 lr: 0.000006 grad: 0.1843 (0.3447) loss: 0.7856 (0.7862) time: 0.2666 data: 0.0001 max mem: 26157 Train: [86] [2100/6250] eta: 0:18:37 lr: 0.000006 grad: 0.1910 (0.3442) loss: 0.7842 (0.7861) time: 0.2664 data: 0.0001 max mem: 26157 Train: [86] [2200/6250] eta: 0:18:10 lr: 0.000006 grad: 0.1976 (0.3403) loss: 0.7855 (0.7859) time: 0.2667 data: 0.0001 max mem: 26157 Train: [86] [2300/6250] eta: 0:17:42 lr: 0.000006 grad: 0.1951 (0.3362) loss: 0.7814 (0.7858) time: 0.2666 data: 0.0001 max mem: 26157 Train: [86] [2400/6250] eta: 0:17:15 lr: 0.000006 grad: 0.2059 (0.3364) loss: 0.7791 (0.7855) time: 0.2679 data: 0.0001 max mem: 26157 Train: [86] [2500/6250] eta: 0:16:48 lr: 0.000006 grad: 0.2082 (0.3366) loss: 0.7807 (0.7854) time: 0.2667 data: 0.0001 max mem: 26157 Train: [86] [2600/6250] eta: 0:16:21 lr: 0.000006 grad: 0.2161 (0.3401) loss: 0.7703 (0.7851) time: 0.2669 data: 0.0001 max mem: 26157 Train: [86] [2700/6250] eta: 0:15:54 lr: 0.000006 grad: 0.2117 (0.3380) loss: 0.7784 (0.7849) time: 0.2663 data: 0.0001 max mem: 26157 Train: [86] [2800/6250] eta: 0:15:26 lr: 0.000006 grad: 0.2184 (0.3377) loss: 0.7805 (0.7848) time: 0.2662 data: 0.0001 max mem: 26157 Train: [86] [2900/6250] eta: 0:14:59 lr: 0.000006 grad: 0.2256 (0.3414) loss: 0.7752 (0.7846) time: 0.2671 data: 0.0001 max mem: 26157 Train: [86] [3000/6250] eta: 0:14:32 lr: 0.000006 grad: 0.2134 (0.3450) loss: 0.7825 (0.7846) time: 0.2676 data: 0.0001 max mem: 26157 Train: [86] [3100/6250] eta: 0:14:05 lr: 0.000006 grad: 0.1999 (0.3444) loss: 0.7851 (0.7845) time: 0.2673 data: 0.0001 max mem: 26157 Train: [86] [3200/6250] eta: 0:13:38 lr: 0.000006 grad: 0.2181 (0.3424) loss: 0.7807 (0.7843) time: 0.2659 data: 0.0001 max mem: 26157 Train: [86] [3300/6250] eta: 0:13:11 lr: 0.000006 grad: 0.2177 (0.3454) loss: 0.7827 (0.7844) time: 0.2667 data: 0.0001 max mem: 26157 Train: [86] [3400/6250] eta: 0:12:44 lr: 0.000006 grad: 0.2007 (0.3459) loss: 0.7804 (0.7843) time: 0.2662 data: 0.0001 max mem: 26157 Train: [86] [3500/6250] eta: 0:12:17 lr: 0.000006 grad: 0.2208 (0.3445) loss: 0.7763 (0.7843) time: 0.2667 data: 0.0001 max mem: 26157 Train: [86] [3600/6250] eta: 0:11:51 lr: 0.000006 grad: 0.2099 (0.3474) loss: 0.7837 (0.7843) time: 0.2678 data: 0.0001 max mem: 26157 Train: [86] [3700/6250] eta: 0:11:24 lr: 0.000006 grad: 0.2291 (0.3470) loss: 0.7822 (0.7843) time: 0.2672 data: 0.0001 max mem: 26157 Train: [86] [3800/6250] eta: 0:10:57 lr: 0.000006 grad: 0.2026 (0.3483) loss: 0.7793 (0.7843) time: 0.2667 data: 0.0001 max mem: 26157 Train: [86] [3900/6250] eta: 0:10:30 lr: 0.000006 grad: 0.2048 (0.3481) loss: 0.7811 (0.7842) time: 0.2666 data: 0.0001 max mem: 26157 Train: [86] [4000/6250] eta: 0:10:03 lr: 0.000006 grad: 0.2056 (0.3472) loss: 0.7818 (0.7842) time: 0.2661 data: 0.0001 max mem: 26157 Train: [86] [4100/6250] eta: 0:09:36 lr: 0.000006 grad: 0.2175 (0.3453) loss: 0.7775 (0.7842) time: 0.2665 data: 0.0001 max mem: 26157 Train: [86] [4200/6250] eta: 0:09:09 lr: 0.000006 grad: 0.2129 (0.3446) loss: 0.7811 (0.7840) time: 0.2666 data: 0.0001 max mem: 26157 Train: [86] [4300/6250] eta: 0:08:42 lr: 0.000006 grad: 0.2193 (0.3535) loss: 0.7778 (0.7840) time: 0.2670 data: 0.0001 max mem: 26157 Train: [86] [4400/6250] eta: 0:08:15 lr: 0.000006 grad: 0.2005 (0.3574) loss: 0.7826 (0.7839) time: 0.2666 data: 0.0001 max mem: 26157 Train: [86] [4500/6250] eta: 0:07:49 lr: 0.000006 grad: 0.2092 (0.3587) loss: 0.7815 (0.7840) time: 0.2674 data: 0.0002 max mem: 26157 Train: [86] [4600/6250] eta: 0:07:22 lr: 0.000006 grad: 0.1985 (0.3598) loss: 0.7837 (0.7840) time: 0.2664 data: 0.0001 max mem: 26157 Train: [86] [4700/6250] eta: 0:06:55 lr: 0.000006 grad: 0.2132 (0.3589) loss: 0.7835 (0.7840) time: 0.2676 data: 0.0001 max mem: 26157 Train: [86] [4800/6250] eta: 0:06:28 lr: 0.000006 grad: 0.1976 (0.3572) loss: 0.7816 (0.7840) time: 0.2670 data: 0.0001 max mem: 26157 Train: [86] [4900/6250] eta: 0:06:01 lr: 0.000006 grad: 0.2039 (0.3553) loss: 0.7830 (0.7840) time: 0.2662 data: 0.0002 max mem: 26157 Train: [86] [5000/6250] eta: 0:05:34 lr: 0.000006 grad: 0.2010 (0.3549) loss: 0.7841 (0.7840) time: 0.2665 data: 0.0001 max mem: 26157 Train: [86] [5100/6250] eta: 0:05:08 lr: 0.000006 grad: 0.1931 (0.3557) loss: 0.7889 (0.7840) time: 0.2667 data: 0.0001 max mem: 26157 Train: [86] [5200/6250] eta: 0:04:41 lr: 0.000006 grad: 0.2014 (0.3541) loss: 0.7833 (0.7840) time: 0.2664 data: 0.0001 max mem: 26157 Train: [86] [5300/6250] eta: 0:04:14 lr: 0.000006 grad: 0.2014 (0.3541) loss: 0.7861 (0.7840) time: 0.2665 data: 0.0001 max mem: 26157 Train: [86] [5400/6250] eta: 0:03:47 lr: 0.000006 grad: 0.1914 (0.3544) loss: 0.7752 (0.7840) time: 0.2666 data: 0.0001 max mem: 26157 Train: [86] [5500/6250] eta: 0:03:20 lr: 0.000006 grad: 0.2125 (0.3545) loss: 0.7819 (0.7839) time: 0.2660 data: 0.0001 max mem: 26157 Train: [86] [5600/6250] eta: 0:02:54 lr: 0.000006 grad: 0.2168 (0.3546) loss: 0.7839 (0.7839) time: 0.2677 data: 0.0001 max mem: 26157 Train: [86] [5700/6250] eta: 0:02:27 lr: 0.000006 grad: 0.2007 (0.3544) loss: 0.7772 (0.7839) time: 0.2669 data: 0.0001 max mem: 26157 Train: [86] [5800/6250] eta: 0:02:00 lr: 0.000006 grad: 0.2026 (0.3542) loss: 0.7832 (0.7839) time: 0.2664 data: 0.0001 max mem: 26157 Train: [86] [5900/6250] eta: 0:01:33 lr: 0.000006 grad: 0.2208 (0.3543) loss: 0.7850 (0.7839) time: 0.2667 data: 0.0001 max mem: 26157 Train: [86] [6000/6250] eta: 0:01:06 lr: 0.000006 grad: 0.2165 (0.3543) loss: 0.7773 (0.7838) time: 0.2675 data: 0.0001 max mem: 26157 Train: [86] [6100/6250] eta: 0:00:40 lr: 0.000006 grad: 0.2011 (0.3571) loss: 0.7866 (0.7838) time: 0.2678 data: 0.0001 max mem: 26157 Train: [86] [6200/6250] eta: 0:00:13 lr: 0.000006 grad: 0.2085 (0.3596) loss: 0.7794 (0.7838) time: 0.2674 data: 0.0002 max mem: 26157 Train: [86] [6249/6250] eta: 0:00:00 lr: 0.000006 grad: 0.2125 (0.3593) loss: 0.7812 (0.7838) time: 0.2665 data: 0.0001 max mem: 26157 Train: [86] Total time: 0:27:56 (0.2682 s / it) Averaged stats: lr: 0.000006 grad: 0.2125 (0.3593) loss: 0.7812 (0.7838) Eval (hcp-train-subset): [86] [ 0/62] eta: 0:03:21 loss: 0.8007 (0.8007) time: 3.2424 data: 3.1408 max mem: 26157 Eval (hcp-train-subset): [86] [61/62] eta: 0:00:00 loss: 0.7906 (0.7926) time: 0.0952 data: 0.0124 max mem: 26157 Eval (hcp-train-subset): [86] Total time: 0:00:11 (0.1795 s / it) Averaged stats (hcp-train-subset): loss: 0.7906 (0.7926) Making plots (hcp-train-subset): example=26 Eval (hcp-val): [86] [ 0/62] eta: 0:04:24 loss: 0.8207 (0.8207) time: 4.2687 data: 4.1856 max mem: 26157 Eval (hcp-val): [86] [61/62] eta: 0:00:00 loss: 0.8233 (0.8241) time: 0.0821 data: 0.0001 max mem: 26157 Eval (hcp-val): [86] Total time: 0:00:10 (0.1708 s / it) Averaged stats (hcp-val): loss: 0.8233 (0.8241) Making plots (hcp-val): example=18 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [87] [ 0/6250] eta: 11:16:29 lr: 0.000006 grad: 0.1430 (0.1430) loss: 0.8272 (0.8272) time: 6.4943 data: 6.2235 max mem: 26157 Train: [87] [ 100/6250] eta: 0:33:47 lr: 0.000006 grad: 0.3013 (0.6362) loss: 0.7544 (0.7798) time: 0.2664 data: 0.0001 max mem: 26157 Train: [87] [ 200/6250] eta: 0:30:07 lr: 0.000006 grad: 0.2107 (0.5314) loss: 0.7786 (0.7785) time: 0.2675 data: 0.0002 max mem: 26157 Train: [87] [ 300/6250] eta: 0:28:34 lr: 0.000006 grad: 0.2003 (0.4664) loss: 0.7939 (0.7819) time: 0.2661 data: 0.0001 max mem: 26157 Train: [87] [ 400/6250] eta: 0:27:34 lr: 0.000006 grad: 0.2146 (0.4459) loss: 0.7851 (0.7831) time: 0.2674 data: 0.0001 max mem: 26157 Train: [87] [ 500/6250] eta: 0:26:48 lr: 0.000006 grad: 0.2339 (0.4349) loss: 0.7768 (0.7822) time: 0.2676 data: 0.0001 max mem: 26157 Train: [87] [ 600/6250] eta: 0:26:08 lr: 0.000006 grad: 0.2142 (0.4241) loss: 0.7832 (0.7820) time: 0.2671 data: 0.0001 max mem: 26157 Train: [87] [ 700/6250] eta: 0:25:32 lr: 0.000006 grad: 0.2139 (0.4278) loss: 0.7761 (0.7819) time: 0.2667 data: 0.0001 max mem: 26157 Train: [87] [ 800/6250] eta: 0:24:58 lr: 0.000006 grad: 0.2040 (0.4405) loss: 0.7909 (0.7820) time: 0.2672 data: 0.0001 max mem: 26157 Train: [87] [ 900/6250] eta: 0:24:26 lr: 0.000006 grad: 0.2044 (0.4371) loss: 0.7818 (0.7824) time: 0.2675 data: 0.0001 max mem: 26157 Train: [87] [1000/6250] eta: 0:23:55 lr: 0.000006 grad: 0.1947 (0.4322) loss: 0.7899 (0.7828) time: 0.2668 data: 0.0001 max mem: 26157 Train: [87] [1100/6250] eta: 0:23:25 lr: 0.000006 grad: 0.2184 (0.4149) loss: 0.7765 (0.7830) time: 0.2660 data: 0.0001 max mem: 26157 Train: [87] [1200/6250] eta: 0:22:55 lr: 0.000006 grad: 0.2153 (0.4112) loss: 0.7826 (0.7829) time: 0.2678 data: 0.0002 max mem: 26157 Train: [87] [1300/6250] eta: 0:22:26 lr: 0.000006 grad: 0.2210 (0.4074) loss: 0.7839 (0.7830) time: 0.2663 data: 0.0001 max mem: 26157 Train: [87] [1400/6250] eta: 0:21:57 lr: 0.000005 grad: 0.1938 (0.4046) loss: 0.7856 (0.7831) time: 0.2668 data: 0.0001 max mem: 26157 Train: [87] [1500/6250] eta: 0:21:28 lr: 0.000005 grad: 0.2122 (0.4030) loss: 0.7803 (0.7830) time: 0.2666 data: 0.0001 max mem: 26157 Train: [87] [1600/6250] eta: 0:21:00 lr: 0.000005 grad: 0.1994 (0.4128) loss: 0.7831 (0.7830) time: 0.2664 data: 0.0001 max mem: 26157 Train: [87] [1700/6250] eta: 0:20:32 lr: 0.000005 grad: 0.2110 (0.4034) loss: 0.7807 (0.7831) time: 0.2677 data: 0.0001 max mem: 26157 Train: [87] [1800/6250] eta: 0:20:04 lr: 0.000005 grad: 0.2030 (0.4169) loss: 0.7807 (0.7830) time: 0.2662 data: 0.0001 max mem: 26157 Train: [87] [1900/6250] eta: 0:19:36 lr: 0.000005 grad: 0.2059 (0.4114) loss: 0.7800 (0.7830) time: 0.2667 data: 0.0001 max mem: 26157 Train: [87] [2000/6250] eta: 0:19:08 lr: 0.000005 grad: 0.2005 (0.4072) loss: 0.7890 (0.7829) time: 0.2673 data: 0.0001 max mem: 26157 Train: [87] [2100/6250] eta: 0:18:40 lr: 0.000005 grad: 0.2072 (0.4109) loss: 0.7845 (0.7829) time: 0.2667 data: 0.0001 max mem: 26157 Train: [87] [2200/6250] eta: 0:18:13 lr: 0.000005 grad: 0.1876 (0.4129) loss: 0.7858 (0.7829) time: 0.2663 data: 0.0001 max mem: 26157 Train: [87] [2300/6250] eta: 0:17:45 lr: 0.000005 grad: 0.1984 (0.4140) loss: 0.7787 (0.7829) time: 0.2684 data: 0.0001 max mem: 26157 Train: [87] [2400/6250] eta: 0:17:18 lr: 0.000005 grad: 0.2349 (0.4092) loss: 0.7865 (0.7831) time: 0.2667 data: 0.0001 max mem: 26157 Train: [87] [2500/6250] eta: 0:16:50 lr: 0.000005 grad: 0.1978 (0.4079) loss: 0.7800 (0.7831) time: 0.2668 data: 0.0002 max mem: 26157 Train: [87] [2600/6250] eta: 0:16:23 lr: 0.000005 grad: 0.1999 (0.4194) loss: 0.7836 (0.7831) time: 0.2665 data: 0.0001 max mem: 26157 Train: [87] [2700/6250] eta: 0:15:56 lr: 0.000005 grad: 0.2018 (0.4162) loss: 0.7822 (0.7831) time: 0.2669 data: 0.0001 max mem: 26157 Train: [87] [2800/6250] eta: 0:15:29 lr: 0.000005 grad: 0.1847 (0.4156) loss: 0.7895 (0.7832) time: 0.2681 data: 0.0001 max mem: 26157 Train: [87] [2900/6250] eta: 0:15:01 lr: 0.000005 grad: 0.2052 (0.4142) loss: 0.7751 (0.7832) time: 0.2673 data: 0.0001 max mem: 26157 Train: [87] [3000/6250] eta: 0:14:34 lr: 0.000005 grad: 0.2031 (0.4156) loss: 0.7799 (0.7832) time: 0.2661 data: 0.0001 max mem: 26157 Train: [87] [3100/6250] eta: 0:14:07 lr: 0.000005 grad: 0.2031 (0.4143) loss: 0.7768 (0.7831) time: 0.2666 data: 0.0001 max mem: 26157 Train: [87] [3200/6250] eta: 0:13:40 lr: 0.000005 grad: 0.2040 (0.4185) loss: 0.7835 (0.7831) time: 0.2659 data: 0.0001 max mem: 26157 Train: [87] [3300/6250] eta: 0:13:13 lr: 0.000005 grad: 0.1993 (0.4196) loss: 0.7824 (0.7831) time: 0.2667 data: 0.0001 max mem: 26157 Train: [87] [3400/6250] eta: 0:12:46 lr: 0.000005 grad: 0.2123 (0.4275) loss: 0.7890 (0.7832) time: 0.2671 data: 0.0001 max mem: 26157 Train: [87] [3500/6250] eta: 0:12:19 lr: 0.000005 grad: 0.1994 (0.4267) loss: 0.7885 (0.7832) time: 0.2669 data: 0.0001 max mem: 26157 Train: [87] [3600/6250] eta: 0:11:52 lr: 0.000005 grad: 0.2003 (0.4260) loss: 0.7847 (0.7833) time: 0.2675 data: 0.0001 max mem: 26157 Train: [87] [3700/6250] eta: 0:11:25 lr: 0.000005 grad: 0.1942 (0.4257) loss: 0.7905 (0.7833) time: 0.2664 data: 0.0001 max mem: 26157 Train: [87] [3800/6250] eta: 0:10:58 lr: 0.000005 grad: 0.2107 (0.4300) loss: 0.7777 (0.7833) time: 0.2668 data: 0.0001 max mem: 26157 Train: [87] [3900/6250] eta: 0:10:31 lr: 0.000005 grad: 0.2118 (0.4321) loss: 0.7707 (0.7833) time: 0.2664 data: 0.0001 max mem: 26157 Train: [87] [4000/6250] eta: 0:10:04 lr: 0.000005 grad: 0.1895 (0.4294) loss: 0.7801 (0.7831) time: 0.2660 data: 0.0001 max mem: 26157 Train: [87] [4100/6250] eta: 0:09:37 lr: 0.000005 grad: 0.2062 (0.4317) loss: 0.7774 (0.7831) time: 0.2669 data: 0.0001 max mem: 26157 Train: [87] [4200/6250] eta: 0:09:10 lr: 0.000005 grad: 0.1986 (0.4297) loss: 0.7775 (0.7830) time: 0.2670 data: 0.0001 max mem: 26157 Train: [87] [4300/6250] eta: 0:08:43 lr: 0.000005 grad: 0.2099 (0.4277) loss: 0.7781 (0.7829) time: 0.2679 data: 0.0002 max mem: 26157 Train: [87] [4400/6250] eta: 0:08:16 lr: 0.000005 grad: 0.2060 (0.4251) loss: 0.7783 (0.7828) time: 0.2665 data: 0.0001 max mem: 26157 Train: [87] [4500/6250] eta: 0:07:49 lr: 0.000005 grad: 0.2076 (0.4235) loss: 0.7778 (0.7827) time: 0.2680 data: 0.0002 max mem: 26157 Train: [87] [4600/6250] eta: 0:07:22 lr: 0.000005 grad: 0.2019 (0.4256) loss: 0.7782 (0.7826) time: 0.2673 data: 0.0001 max mem: 26157 Train: [87] [4700/6250] eta: 0:06:55 lr: 0.000005 grad: 0.2238 (0.4256) loss: 0.7750 (0.7824) time: 0.2667 data: 0.0001 max mem: 26157 Train: [87] [4800/6250] eta: 0:06:28 lr: 0.000005 grad: 0.2085 (0.4236) loss: 0.7758 (0.7824) time: 0.2679 data: 0.0001 max mem: 26157 Train: [87] [4900/6250] eta: 0:06:02 lr: 0.000005 grad: 0.2116 (0.4276) loss: 0.7779 (0.7822) time: 0.2673 data: 0.0001 max mem: 26157 Train: [87] [5000/6250] eta: 0:05:35 lr: 0.000005 grad: 0.2240 (0.4328) loss: 0.7732 (0.7821) time: 0.2666 data: 0.0001 max mem: 26157 Train: [87] [5100/6250] eta: 0:05:08 lr: 0.000005 grad: 0.2105 (0.4332) loss: 0.7754 (0.7819) time: 0.2671 data: 0.0001 max mem: 26157 Train: [87] [5200/6250] eta: 0:04:41 lr: 0.000005 grad: 0.2196 (0.4375) loss: 0.7808 (0.7819) time: 0.2667 data: 0.0001 max mem: 26157 Train: [87] [5300/6250] eta: 0:04:14 lr: 0.000005 grad: 0.2105 (0.4350) loss: 0.7783 (0.7818) time: 0.2659 data: 0.0001 max mem: 26157 Train: [87] [5400/6250] eta: 0:03:47 lr: 0.000005 grad: 0.1968 (0.4358) loss: 0.7877 (0.7818) time: 0.2663 data: 0.0001 max mem: 26157 Train: [87] [5500/6250] eta: 0:03:21 lr: 0.000005 grad: 0.2229 (0.4394) loss: 0.7814 (0.7818) time: 0.2670 data: 0.0001 max mem: 26157 Train: [87] [5600/6250] eta: 0:02:54 lr: 0.000005 grad: 0.2210 (0.4421) loss: 0.7804 (0.7818) time: 0.2674 data: 0.0002 max mem: 26157 Train: [87] [5700/6250] eta: 0:02:27 lr: 0.000005 grad: 0.1988 (0.4415) loss: 0.7788 (0.7818) time: 0.2662 data: 0.0001 max mem: 26157 Train: [87] [5800/6250] eta: 0:02:00 lr: 0.000005 grad: 0.2215 (0.4386) loss: 0.7810 (0.7817) time: 0.2663 data: 0.0001 max mem: 26157 Train: [87] [5900/6250] eta: 0:01:33 lr: 0.000005 grad: 0.2182 (0.4370) loss: 0.7787 (0.7816) time: 0.2673 data: 0.0001 max mem: 26157 Train: [87] [6000/6250] eta: 0:01:07 lr: 0.000005 grad: 0.2133 (0.4369) loss: 0.7750 (0.7815) time: 0.2665 data: 0.0001 max mem: 26157 Train: [87] [6100/6250] eta: 0:00:40 lr: 0.000005 grad: 0.2045 (0.4373) loss: 0.7723 (0.7814) time: 0.2667 data: 0.0001 max mem: 26157 Train: [87] [6200/6250] eta: 0:00:13 lr: 0.000005 grad: 0.2120 (0.4354) loss: 0.7718 (0.7813) time: 0.2682 data: 0.0001 max mem: 26157 Train: [87] [6249/6250] eta: 0:00:00 lr: 0.000005 grad: 0.2090 (0.4339) loss: 0.7788 (0.7813) time: 0.2670 data: 0.0001 max mem: 26157 Train: [87] Total time: 0:27:58 (0.2685 s / it) Averaged stats: lr: 0.000005 grad: 0.2090 (0.4339) loss: 0.7788 (0.7813) Eval (hcp-train-subset): [87] [ 0/62] eta: 0:04:07 loss: 0.8006 (0.8006) time: 3.9982 data: 3.9156 max mem: 26157 Eval (hcp-train-subset): [87] [61/62] eta: 0:00:00 loss: 0.7900 (0.7913) time: 0.0853 data: 0.0031 max mem: 26157 Eval (hcp-train-subset): [87] Total time: 0:00:10 (0.1643 s / it) Averaged stats (hcp-train-subset): loss: 0.7900 (0.7913) Making plots (hcp-train-subset): example=22 Eval (hcp-val): [87] [ 0/62] eta: 0:04:34 loss: 0.8216 (0.8216) time: 4.4227 data: 4.3397 max mem: 26157 Eval (hcp-val): [87] [61/62] eta: 0:00:00 loss: 0.8239 (0.8240) time: 0.0852 data: 0.0027 max mem: 26157 Eval (hcp-val): [87] Total time: 0:00:10 (0.1711 s / it) Averaged stats (hcp-val): loss: 0.8239 (0.8240) Making plots (hcp-val): example=28 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [88] [ 0/6250] eta: 8:57:43 lr: 0.000005 grad: 0.2054 (0.2054) loss: 0.8131 (0.8131) time: 5.1622 data: 4.8933 max mem: 26157 Train: [88] [ 100/6250] eta: 0:32:27 lr: 0.000005 grad: 0.2469 (0.3119) loss: 0.7929 (0.7917) time: 0.2688 data: 0.0002 max mem: 26157 Train: [88] [ 200/6250] eta: 0:29:27 lr: 0.000005 grad: 0.2370 (0.4498) loss: 0.7698 (0.7840) time: 0.2679 data: 0.0001 max mem: 26157 Train: [88] [ 300/6250] eta: 0:28:09 lr: 0.000005 grad: 0.2190 (0.4061) loss: 0.7789 (0.7828) time: 0.2671 data: 0.0001 max mem: 26157 Train: [88] [ 400/6250] eta: 0:27:16 lr: 0.000005 grad: 0.2137 (0.4354) loss: 0.7768 (0.7826) time: 0.2681 data: 0.0001 max mem: 26157 Train: [88] [ 500/6250] eta: 0:26:34 lr: 0.000005 grad: 0.1988 (0.4086) loss: 0.7898 (0.7830) time: 0.2670 data: 0.0002 max mem: 26157 Train: [88] [ 600/6250] eta: 0:25:56 lr: 0.000005 grad: 0.2108 (0.3953) loss: 0.7816 (0.7833) time: 0.2665 data: 0.0001 max mem: 26157 Train: [88] [ 700/6250] eta: 0:25:22 lr: 0.000005 grad: 0.2102 (0.3765) loss: 0.7891 (0.7837) time: 0.2668 data: 0.0001 max mem: 26157 Train: [88] [ 800/6250] eta: 0:24:50 lr: 0.000005 grad: 0.2130 (0.3679) loss: 0.7891 (0.7840) time: 0.2673 data: 0.0002 max mem: 26157 Train: [88] [ 900/6250] eta: 0:24:19 lr: 0.000005 grad: 0.2024 (0.3609) loss: 0.7845 (0.7840) time: 0.2669 data: 0.0001 max mem: 26157 Train: [88] [1000/6250] eta: 0:23:49 lr: 0.000005 grad: 0.2127 (0.3645) loss: 0.7843 (0.7843) time: 0.2674 data: 0.0002 max mem: 26157 Train: [88] [1100/6250] eta: 0:23:19 lr: 0.000005 grad: 0.1998 (0.3711) loss: 0.7877 (0.7842) time: 0.2668 data: 0.0002 max mem: 26157 Train: [88] [1200/6250] eta: 0:22:50 lr: 0.000005 grad: 0.2011 (0.3757) loss: 0.7869 (0.7842) time: 0.2665 data: 0.0001 max mem: 26157 Train: [88] [1300/6250] eta: 0:22:21 lr: 0.000005 grad: 0.1936 (0.3770) loss: 0.7872 (0.7841) time: 0.2670 data: 0.0001 max mem: 26157 Train: [88] [1400/6250] eta: 0:21:53 lr: 0.000005 grad: 0.2050 (0.4220) loss: 0.7761 (0.7839) time: 0.2665 data: 0.0001 max mem: 26157 Train: [88] [1500/6250] eta: 0:21:24 lr: 0.000005 grad: 0.2026 (0.4247) loss: 0.7838 (0.7835) time: 0.2673 data: 0.0001 max mem: 26157 Train: [88] [1600/6250] eta: 0:20:56 lr: 0.000005 grad: 0.2004 (0.4200) loss: 0.7793 (0.7834) time: 0.2665 data: 0.0001 max mem: 26157 Train: [88] [1700/6250] eta: 0:20:28 lr: 0.000005 grad: 0.2157 (0.4251) loss: 0.7795 (0.7830) time: 0.2665 data: 0.0001 max mem: 26157 Train: [88] [1800/6250] eta: 0:20:01 lr: 0.000005 grad: 0.2080 (0.4265) loss: 0.7770 (0.7828) time: 0.2665 data: 0.0002 max mem: 26157 Train: [88] [1900/6250] eta: 0:19:33 lr: 0.000005 grad: 0.2062 (0.4284) loss: 0.7823 (0.7827) time: 0.2670 data: 0.0001 max mem: 26157 Train: [88] [2000/6250] eta: 0:19:05 lr: 0.000005 grad: 0.1998 (0.4349) loss: 0.7784 (0.7826) time: 0.2670 data: 0.0002 max mem: 26157 Train: [88] [2100/6250] eta: 0:18:38 lr: 0.000005 grad: 0.2039 (0.4275) loss: 0.7823 (0.7827) time: 0.2673 data: 0.0002 max mem: 26157 Train: [88] [2200/6250] eta: 0:18:11 lr: 0.000005 grad: 0.1980 (0.4217) loss: 0.7911 (0.7830) time: 0.2670 data: 0.0001 max mem: 26157 Train: [88] [2300/6250] eta: 0:17:43 lr: 0.000005 grad: 0.2151 (0.4152) loss: 0.7767 (0.7830) time: 0.2671 data: 0.0001 max mem: 26157 Train: [88] [2400/6250] eta: 0:17:16 lr: 0.000005 grad: 0.2039 (0.4098) loss: 0.7933 (0.7832) time: 0.2678 data: 0.0002 max mem: 26157 Train: [88] [2500/6250] eta: 0:16:49 lr: 0.000005 grad: 0.1939 (0.4078) loss: 0.7890 (0.7833) time: 0.2675 data: 0.0002 max mem: 26157 Train: [88] [2600/6250] eta: 0:16:21 lr: 0.000005 grad: 0.2028 (0.4031) loss: 0.7863 (0.7835) time: 0.2668 data: 0.0001 max mem: 26157 Train: [88] [2700/6250] eta: 0:15:54 lr: 0.000005 grad: 0.2150 (0.3984) loss: 0.7845 (0.7836) time: 0.2669 data: 0.0001 max mem: 26157 Train: [88] [2800/6250] eta: 0:15:27 lr: 0.000005 grad: 0.2063 (0.3936) loss: 0.7861 (0.7836) time: 0.2670 data: 0.0001 max mem: 26157 Train: [88] [2900/6250] eta: 0:15:00 lr: 0.000004 grad: 0.2076 (0.3966) loss: 0.7874 (0.7837) time: 0.2674 data: 0.0002 max mem: 26157 Train: [88] [3000/6250] eta: 0:14:33 lr: 0.000004 grad: 0.2177 (0.3921) loss: 0.7763 (0.7838) time: 0.2662 data: 0.0001 max mem: 26157 Train: [88] [3100/6250] eta: 0:14:06 lr: 0.000004 grad: 0.2093 (0.3903) loss: 0.7861 (0.7839) time: 0.2667 data: 0.0001 max mem: 26157 Train: [88] [3200/6250] eta: 0:13:39 lr: 0.000004 grad: 0.2196 (0.3902) loss: 0.7815 (0.7839) time: 0.2664 data: 0.0001 max mem: 26157 Train: [88] [3300/6250] eta: 0:13:12 lr: 0.000004 grad: 0.2062 (0.3917) loss: 0.7788 (0.7840) time: 0.2672 data: 0.0002 max mem: 26157 Train: [88] [3400/6250] eta: 0:12:45 lr: 0.000004 grad: 0.2009 (0.3901) loss: 0.7878 (0.7840) time: 0.2664 data: 0.0001 max mem: 26157 Train: [88] [3500/6250] eta: 0:12:18 lr: 0.000004 grad: 0.2152 (0.3895) loss: 0.7831 (0.7839) time: 0.2668 data: 0.0001 max mem: 26157 Train: [88] [3600/6250] eta: 0:11:51 lr: 0.000004 grad: 0.2165 (0.3917) loss: 0.7812 (0.7839) time: 0.2666 data: 0.0001 max mem: 26157 Train: [88] [3700/6250] eta: 0:11:24 lr: 0.000004 grad: 0.2211 (0.3894) loss: 0.7842 (0.7838) time: 0.2661 data: 0.0001 max mem: 26157 Train: [88] [3800/6250] eta: 0:10:57 lr: 0.000004 grad: 0.2010 (0.3890) loss: 0.7750 (0.7838) time: 0.2671 data: 0.0002 max mem: 26157 Train: [88] [3900/6250] eta: 0:10:30 lr: 0.000004 grad: 0.2098 (0.3873) loss: 0.7783 (0.7837) time: 0.2671 data: 0.0001 max mem: 26157 Train: [88] [4000/6250] eta: 0:10:03 lr: 0.000004 grad: 0.2024 (0.3862) loss: 0.7804 (0.7836) time: 0.2660 data: 0.0001 max mem: 26157 Train: [88] [4100/6250] eta: 0:09:36 lr: 0.000004 grad: 0.2036 (0.3865) loss: 0.7773 (0.7835) time: 0.2662 data: 0.0001 max mem: 26157 Train: [88] [4200/6250] eta: 0:09:09 lr: 0.000004 grad: 0.2092 (0.3857) loss: 0.7836 (0.7834) time: 0.2673 data: 0.0001 max mem: 26157 Train: [88] [4300/6250] eta: 0:08:42 lr: 0.000004 grad: 0.2148 (0.3869) loss: 0.7761 (0.7833) time: 0.2678 data: 0.0002 max mem: 26157 Train: [88] [4400/6250] eta: 0:08:16 lr: 0.000004 grad: 0.2019 (0.3902) loss: 0.7822 (0.7832) time: 0.2670 data: 0.0002 max mem: 26157 Train: [88] [4500/6250] eta: 0:07:49 lr: 0.000004 grad: 0.2156 (0.3927) loss: 0.7707 (0.7831) time: 0.2668 data: 0.0002 max mem: 26157 Train: [88] [4600/6250] eta: 0:07:22 lr: 0.000004 grad: 0.2126 (0.3953) loss: 0.7745 (0.7830) time: 0.2669 data: 0.0001 max mem: 26157 Train: [88] [4700/6250] eta: 0:06:55 lr: 0.000004 grad: 0.2039 (0.3934) loss: 0.7827 (0.7829) time: 0.2667 data: 0.0001 max mem: 26157 Train: [88] [4800/6250] eta: 0:06:28 lr: 0.000004 grad: 0.2096 (0.3903) loss: 0.7857 (0.7829) time: 0.2675 data: 0.0001 max mem: 26157 Train: [88] [4900/6250] eta: 0:06:01 lr: 0.000004 grad: 0.2061 (0.3882) loss: 0.7916 (0.7828) time: 0.2675 data: 0.0001 max mem: 26157 Train: [88] [5000/6250] eta: 0:05:34 lr: 0.000004 grad: 0.2112 (0.3865) loss: 0.7850 (0.7828) time: 0.2666 data: 0.0001 max mem: 26157 Train: [88] [5100/6250] eta: 0:05:08 lr: 0.000004 grad: 0.2102 (0.3843) loss: 0.7862 (0.7828) time: 0.2660 data: 0.0002 max mem: 26157 Train: [88] [5200/6250] eta: 0:04:41 lr: 0.000004 grad: 0.2140 (0.3865) loss: 0.7769 (0.7828) time: 0.2673 data: 0.0001 max mem: 26157 Train: [88] [5300/6250] eta: 0:04:14 lr: 0.000004 grad: 0.2083 (0.3898) loss: 0.7759 (0.7827) time: 0.2669 data: 0.0001 max mem: 26157 Train: [88] [5400/6250] eta: 0:03:47 lr: 0.000004 grad: 0.2342 (0.3902) loss: 0.7795 (0.7827) time: 0.2672 data: 0.0001 max mem: 26157 Train: [88] [5500/6250] eta: 0:03:20 lr: 0.000004 grad: 0.1999 (0.3890) loss: 0.7817 (0.7826) time: 0.2663 data: 0.0001 max mem: 26157 Train: [88] [5600/6250] eta: 0:02:54 lr: 0.000004 grad: 0.2133 (0.3865) loss: 0.7788 (0.7826) time: 0.2671 data: 0.0001 max mem: 26157 Train: [88] [5700/6250] eta: 0:02:27 lr: 0.000004 grad: 0.2049 (0.3858) loss: 0.7772 (0.7826) time: 0.2667 data: 0.0001 max mem: 26157 Train: [88] [5800/6250] eta: 0:02:00 lr: 0.000004 grad: 0.2036 (0.3872) loss: 0.7801 (0.7826) time: 0.2672 data: 0.0002 max mem: 26157 Train: [88] [5900/6250] eta: 0:01:33 lr: 0.000004 grad: 0.1936 (0.3863) loss: 0.7879 (0.7826) time: 0.2674 data: 0.0002 max mem: 26157 Train: [88] [6000/6250] eta: 0:01:06 lr: 0.000004 grad: 0.2162 (0.3846) loss: 0.7737 (0.7826) time: 0.2663 data: 0.0001 max mem: 26157 Train: [88] [6100/6250] eta: 0:00:40 lr: 0.000004 grad: 0.2133 (0.3839) loss: 0.7831 (0.7826) time: 0.2670 data: 0.0001 max mem: 26157 Train: [88] [6200/6250] eta: 0:00:13 lr: 0.000004 grad: 0.1954 (0.3872) loss: 0.7881 (0.7826) time: 0.2665 data: 0.0001 max mem: 26157 Train: [88] [6249/6250] eta: 0:00:00 lr: 0.000004 grad: 0.2007 (0.3865) loss: 0.7816 (0.7826) time: 0.2666 data: 0.0001 max mem: 26157 Train: [88] Total time: 0:27:57 (0.2683 s / it) Averaged stats: lr: 0.000004 grad: 0.2007 (0.3865) loss: 0.7816 (0.7826) Eval (hcp-train-subset): [88] [ 0/62] eta: 0:03:36 loss: 0.7942 (0.7942) time: 3.4842 data: 3.3710 max mem: 26157 Eval (hcp-train-subset): [88] [61/62] eta: 0:00:00 loss: 0.7857 (0.7888) time: 0.0823 data: 0.0001 max mem: 26157 Eval (hcp-train-subset): [88] Total time: 0:00:10 (0.1689 s / it) Averaged stats (hcp-train-subset): loss: 0.7857 (0.7888) Making plots (hcp-train-subset): example=10 Eval (hcp-val): [88] [ 0/62] eta: 0:04:47 loss: 0.8206 (0.8206) time: 4.6381 data: 4.5553 max mem: 26157 Eval (hcp-val): [88] [61/62] eta: 0:00:00 loss: 0.8222 (0.8230) time: 0.1171 data: 0.0345 max mem: 26157 Eval (hcp-val): [88] Total time: 0:00:11 (0.1820 s / it) Averaged stats (hcp-val): loss: 0.8222 (0.8230) Making plots (hcp-val): example=29 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [89] [ 0/6250] eta: 8:23:11 lr: 0.000004 grad: 0.2737 (0.2737) loss: 0.8347 (0.8347) time: 4.8307 data: 4.5478 max mem: 26157 Train: [89] [ 100/6250] eta: 0:32:46 lr: 0.000004 grad: 0.2297 (0.3430) loss: 0.8056 (0.7989) time: 0.2673 data: 0.0001 max mem: 26157 Train: [89] [ 200/6250] eta: 0:29:38 lr: 0.000004 grad: 0.2435 (0.3447) loss: 0.7752 (0.7900) time: 0.2666 data: 0.0001 max mem: 26157 Train: [89] [ 300/6250] eta: 0:28:17 lr: 0.000004 grad: 0.2206 (0.3235) loss: 0.7934 (0.7887) time: 0.2678 data: 0.0001 max mem: 26157 Train: [89] [ 400/6250] eta: 0:27:21 lr: 0.000004 grad: 0.2509 (0.3409) loss: 0.7793 (0.7869) time: 0.2671 data: 0.0001 max mem: 26157 Train: [89] [ 500/6250] eta: 0:26:37 lr: 0.000004 grad: 0.2261 (0.3403) loss: 0.7755 (0.7859) time: 0.2670 data: 0.0002 max mem: 26157 Train: [89] [ 600/6250] eta: 0:25:59 lr: 0.000004 grad: 0.2411 (0.3443) loss: 0.7823 (0.7856) time: 0.2669 data: 0.0002 max mem: 26157 Train: [89] [ 700/6250] eta: 0:25:24 lr: 0.000004 grad: 0.2092 (0.3705) loss: 0.7795 (0.7847) time: 0.2666 data: 0.0001 max mem: 26157 Train: [89] [ 800/6250] eta: 0:24:52 lr: 0.000004 grad: 0.2312 (0.3567) loss: 0.7775 (0.7837) time: 0.2674 data: 0.0001 max mem: 26157 Train: [89] [ 900/6250] eta: 0:24:21 lr: 0.000004 grad: 0.2224 (0.3469) loss: 0.7716 (0.7831) time: 0.2681 data: 0.0002 max mem: 26157 Train: [89] [1000/6250] eta: 0:23:50 lr: 0.000004 grad: 0.2129 (0.3546) loss: 0.7815 (0.7827) time: 0.2667 data: 0.0002 max mem: 26157 Train: [89] [1100/6250] eta: 0:23:20 lr: 0.000004 grad: 0.2029 (0.3556) loss: 0.7825 (0.7825) time: 0.2666 data: 0.0001 max mem: 26157 Train: [89] [1200/6250] eta: 0:22:51 lr: 0.000004 grad: 0.2047 (0.3750) loss: 0.7856 (0.7824) time: 0.2681 data: 0.0001 max mem: 26157 Train: [89] [1300/6250] eta: 0:22:22 lr: 0.000004 grad: 0.2101 (0.3679) loss: 0.7814 (0.7821) time: 0.2663 data: 0.0001 max mem: 26157 Train: [89] [1400/6250] eta: 0:21:54 lr: 0.000004 grad: 0.2078 (0.3651) loss: 0.7821 (0.7819) time: 0.2674 data: 0.0001 max mem: 26157 Train: [89] [1500/6250] eta: 0:21:25 lr: 0.000004 grad: 0.2079 (0.3731) loss: 0.7787 (0.7816) time: 0.2666 data: 0.0001 max mem: 26157 Train: [89] [1600/6250] eta: 0:20:57 lr: 0.000004 grad: 0.2145 (0.3660) loss: 0.7815 (0.7816) time: 0.2672 data: 0.0002 max mem: 26157 Train: [89] [1700/6250] eta: 0:20:29 lr: 0.000004 grad: 0.2037 (0.3633) loss: 0.7789 (0.7814) time: 0.2673 data: 0.0002 max mem: 26157 Train: [89] [1800/6250] eta: 0:20:01 lr: 0.000004 grad: 0.2256 (0.3631) loss: 0.7750 (0.7812) time: 0.2667 data: 0.0002 max mem: 26157 Train: [89] [1900/6250] eta: 0:19:34 lr: 0.000004 grad: 0.1991 (0.3793) loss: 0.7856 (0.7813) time: 0.2663 data: 0.0001 max mem: 26157 Train: [89] [2000/6250] eta: 0:19:06 lr: 0.000004 grad: 0.2087 (0.3730) loss: 0.7804 (0.7811) time: 0.2667 data: 0.0001 max mem: 26157 Train: [89] [2100/6250] eta: 0:18:39 lr: 0.000004 grad: 0.2081 (0.3835) loss: 0.7758 (0.7809) time: 0.2679 data: 0.0001 max mem: 26157 Train: [89] [2200/6250] eta: 0:18:11 lr: 0.000004 grad: 0.2154 (0.3772) loss: 0.7828 (0.7808) time: 0.2669 data: 0.0001 max mem: 26157 Train: [89] [2300/6250] eta: 0:17:44 lr: 0.000004 grad: 0.2014 (0.3736) loss: 0.7792 (0.7807) time: 0.2669 data: 0.0001 max mem: 26157 Train: [89] [2400/6250] eta: 0:17:16 lr: 0.000004 grad: 0.2141 (0.3708) loss: 0.7822 (0.7807) time: 0.2666 data: 0.0002 max mem: 26157 Train: [89] [2500/6250] eta: 0:16:49 lr: 0.000004 grad: 0.2033 (0.3683) loss: 0.7772 (0.7806) time: 0.2662 data: 0.0001 max mem: 26157 Train: [89] [2600/6250] eta: 0:16:22 lr: 0.000004 grad: 0.2218 (0.3648) loss: 0.7750 (0.7805) time: 0.2667 data: 0.0001 max mem: 26157 Train: [89] [2700/6250] eta: 0:15:55 lr: 0.000004 grad: 0.2143 (0.3636) loss: 0.7781 (0.7804) time: 0.2671 data: 0.0001 max mem: 26157 Train: [89] [2800/6250] eta: 0:15:28 lr: 0.000004 grad: 0.2066 (0.3599) loss: 0.7800 (0.7803) time: 0.2675 data: 0.0001 max mem: 26157 Train: [89] [2900/6250] eta: 0:15:00 lr: 0.000004 grad: 0.2050 (0.3605) loss: 0.7860 (0.7803) time: 0.2663 data: 0.0001 max mem: 26157 Train: [89] [3000/6250] eta: 0:14:33 lr: 0.000004 grad: 0.2116 (0.3716) loss: 0.7803 (0.7803) time: 0.2678 data: 0.0001 max mem: 26157 Train: [89] [3100/6250] eta: 0:14:06 lr: 0.000004 grad: 0.2052 (0.3686) loss: 0.7773 (0.7802) time: 0.2670 data: 0.0001 max mem: 26157 Train: [89] [3200/6250] eta: 0:13:39 lr: 0.000004 grad: 0.2125 (0.3673) loss: 0.7863 (0.7803) time: 0.2688 data: 0.0002 max mem: 26157 Train: [89] [3300/6250] eta: 0:13:12 lr: 0.000004 grad: 0.2135 (0.3657) loss: 0.7725 (0.7802) time: 0.2667 data: 0.0001 max mem: 26157 Train: [89] [3400/6250] eta: 0:12:45 lr: 0.000004 grad: 0.2059 (0.3726) loss: 0.7782 (0.7801) time: 0.2665 data: 0.0001 max mem: 26157 Train: [89] [3500/6250] eta: 0:12:18 lr: 0.000004 grad: 0.2067 (0.3760) loss: 0.7804 (0.7801) time: 0.2665 data: 0.0001 max mem: 26157 Train: [89] [3600/6250] eta: 0:11:51 lr: 0.000004 grad: 0.2005 (0.3725) loss: 0.7842 (0.7801) time: 0.2671 data: 0.0001 max mem: 26157 Train: [89] [3700/6250] eta: 0:11:24 lr: 0.000004 grad: 0.2239 (0.3695) loss: 0.7762 (0.7802) time: 0.2660 data: 0.0001 max mem: 26157 Train: [89] [3800/6250] eta: 0:10:57 lr: 0.000004 grad: 0.2148 (0.3682) loss: 0.7783 (0.7801) time: 0.2677 data: 0.0001 max mem: 26157 Train: [89] [3900/6250] eta: 0:10:30 lr: 0.000004 grad: 0.1975 (0.3675) loss: 0.7825 (0.7802) time: 0.2672 data: 0.0001 max mem: 26157 Train: [89] [4000/6250] eta: 0:10:03 lr: 0.000004 grad: 0.2237 (0.3669) loss: 0.7748 (0.7801) time: 0.2665 data: 0.0001 max mem: 26157 Train: [89] [4100/6250] eta: 0:09:37 lr: 0.000004 grad: 0.2143 (0.3682) loss: 0.7795 (0.7801) time: 0.2670 data: 0.0001 max mem: 26157 Train: [89] [4200/6250] eta: 0:09:10 lr: 0.000004 grad: 0.2061 (0.3665) loss: 0.7830 (0.7801) time: 0.2661 data: 0.0001 max mem: 26157 Train: [89] [4300/6250] eta: 0:08:43 lr: 0.000004 grad: 0.2248 (0.3659) loss: 0.7808 (0.7801) time: 0.2669 data: 0.0001 max mem: 26157 Train: [89] [4400/6250] eta: 0:08:16 lr: 0.000004 grad: 0.2271 (0.3708) loss: 0.7761 (0.7802) time: 0.2660 data: 0.0001 max mem: 26157 Train: [89] [4500/6250] eta: 0:07:49 lr: 0.000004 grad: 0.2101 (0.3727) loss: 0.7842 (0.7803) time: 0.2663 data: 0.0001 max mem: 26157 Train: [89] [4600/6250] eta: 0:07:22 lr: 0.000004 grad: 0.2057 (0.3727) loss: 0.7882 (0.7803) time: 0.2672 data: 0.0001 max mem: 26157 Train: [89] [4700/6250] eta: 0:06:55 lr: 0.000004 grad: 0.2180 (0.3752) loss: 0.7817 (0.7804) time: 0.2661 data: 0.0001 max mem: 26157 Train: [89] [4800/6250] eta: 0:06:28 lr: 0.000004 grad: 0.2090 (0.3761) loss: 0.7835 (0.7805) time: 0.2668 data: 0.0001 max mem: 26157 Train: [89] [4900/6250] eta: 0:06:02 lr: 0.000004 grad: 0.2141 (0.3749) loss: 0.7790 (0.7806) time: 0.2665 data: 0.0001 max mem: 26157 Train: [89] [5000/6250] eta: 0:05:35 lr: 0.000004 grad: 0.1925 (0.3741) loss: 0.7895 (0.7807) time: 0.2666 data: 0.0001 max mem: 26157 Train: [89] [5100/6250] eta: 0:05:08 lr: 0.000004 grad: 0.2124 (0.3751) loss: 0.7733 (0.7807) time: 0.2663 data: 0.0001 max mem: 26157 Train: [89] [5200/6250] eta: 0:04:41 lr: 0.000003 grad: 0.2136 (0.3759) loss: 0.7848 (0.7808) time: 0.2681 data: 0.0001 max mem: 26157 Train: [89] [5300/6250] eta: 0:04:14 lr: 0.000003 grad: 0.2231 (0.3761) loss: 0.7805 (0.7809) time: 0.2669 data: 0.0001 max mem: 26157 Train: [89] [5400/6250] eta: 0:03:47 lr: 0.000003 grad: 0.2047 (0.3764) loss: 0.7835 (0.7809) time: 0.2667 data: 0.0001 max mem: 26157 Train: [89] [5500/6250] eta: 0:03:21 lr: 0.000003 grad: 0.2093 (0.3751) loss: 0.7810 (0.7809) time: 0.2674 data: 0.0001 max mem: 26157 Train: [89] [5600/6250] eta: 0:02:54 lr: 0.000003 grad: 0.2127 (0.3742) loss: 0.7842 (0.7810) time: 0.2667 data: 0.0001 max mem: 26157 Train: [89] [5700/6250] eta: 0:02:27 lr: 0.000003 grad: 0.2097 (0.3735) loss: 0.7827 (0.7811) time: 0.2675 data: 0.0001 max mem: 26157 Train: [89] [5800/6250] eta: 0:02:00 lr: 0.000003 grad: 0.2165 (0.3740) loss: 0.7827 (0.7812) time: 0.2674 data: 0.0002 max mem: 26157 Train: [89] [5900/6250] eta: 0:01:33 lr: 0.000003 grad: 0.1997 (0.3747) loss: 0.7858 (0.7813) time: 0.2666 data: 0.0001 max mem: 26157 Train: [89] [6000/6250] eta: 0:01:07 lr: 0.000003 grad: 0.2113 (0.3734) loss: 0.7904 (0.7813) time: 0.2671 data: 0.0001 max mem: 26157 Train: [89] [6100/6250] eta: 0:00:40 lr: 0.000003 grad: 0.2167 (0.3745) loss: 0.7821 (0.7814) time: 0.2665 data: 0.0001 max mem: 26157 Train: [89] [6200/6250] eta: 0:00:13 lr: 0.000003 grad: 0.2065 (0.3739) loss: 0.7856 (0.7814) time: 0.2662 data: 0.0002 max mem: 26157 Train: [89] [6249/6250] eta: 0:00:00 lr: 0.000003 grad: 0.2013 (0.3733) loss: 0.7838 (0.7814) time: 0.2696 data: 0.0002 max mem: 26157 Train: [89] Total time: 0:27:58 (0.2685 s / it) Averaged stats: lr: 0.000003 grad: 0.2013 (0.3733) loss: 0.7838 (0.7814) Eval (hcp-train-subset): [89] [ 0/62] eta: 0:04:29 loss: 0.7939 (0.7939) time: 4.3428 data: 4.2602 max mem: 26157 Eval (hcp-train-subset): [89] [61/62] eta: 0:00:00 loss: 0.7841 (0.7885) time: 0.0893 data: 0.0052 max mem: 26157 Eval (hcp-train-subset): [89] Total time: 0:00:10 (0.1678 s / it) Averaged stats (hcp-train-subset): loss: 0.7841 (0.7885) Making plots (hcp-train-subset): example=54 Eval (hcp-val): [89] [ 0/62] eta: 0:02:55 loss: 0.8190 (0.8190) time: 2.8236 data: 2.7240 max mem: 26157 Eval (hcp-val): [89] [61/62] eta: 0:00:00 loss: 0.8217 (0.8232) time: 0.0878 data: 0.0054 max mem: 26157 Eval (hcp-val): [89] Total time: 0:00:10 (0.1705 s / it) Averaged stats (hcp-val): loss: 0.8217 (0.8232) Making plots (hcp-val): example=33 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [90] [ 0/6250] eta: 8:46:12 lr: 0.000003 grad: 0.3723 (0.3723) loss: 0.7716 (0.7716) time: 5.0517 data: 4.7804 max mem: 26157 Train: [90] [ 100/6250] eta: 0:32:21 lr: 0.000003 grad: 0.2862 (0.4008) loss: 0.7929 (0.7899) time: 0.2688 data: 0.0001 max mem: 26157 Train: [90] [ 200/6250] eta: 0:29:27 lr: 0.000003 grad: 0.2464 (0.3802) loss: 0.7755 (0.7873) time: 0.2675 data: 0.0001 max mem: 26157 Train: [90] [ 300/6250] eta: 0:28:09 lr: 0.000003 grad: 0.2263 (0.3897) loss: 0.7899 (0.7860) time: 0.2708 data: 0.0002 max mem: 26157 Train: [90] [ 400/6250] eta: 0:27:16 lr: 0.000003 grad: 0.2346 (0.3686) loss: 0.7918 (0.7869) time: 0.2675 data: 0.0002 max mem: 26157 Train: [90] [ 500/6250] eta: 0:26:34 lr: 0.000003 grad: 0.2228 (0.3657) loss: 0.7893 (0.7871) time: 0.2680 data: 0.0002 max mem: 26157 Train: [90] [ 600/6250] eta: 0:25:57 lr: 0.000003 grad: 0.2164 (0.3497) loss: 0.7901 (0.7869) time: 0.2679 data: 0.0001 max mem: 26157 Train: [90] [ 700/6250] eta: 0:25:23 lr: 0.000003 grad: 0.2209 (0.3413) loss: 0.7776 (0.7868) time: 0.2669 data: 0.0001 max mem: 26157 Train: [90] [ 800/6250] eta: 0:24:51 lr: 0.000003 grad: 0.2125 (0.3403) loss: 0.7808 (0.7863) time: 0.2679 data: 0.0001 max mem: 26157 Train: [90] [ 900/6250] eta: 0:24:20 lr: 0.000003 grad: 0.2065 (0.3532) loss: 0.7782 (0.7863) time: 0.2665 data: 0.0001 max mem: 26157 Train: [90] [1000/6250] eta: 0:23:49 lr: 0.000003 grad: 0.2149 (0.3572) loss: 0.7820 (0.7858) time: 0.2669 data: 0.0001 max mem: 26157 Train: [90] [1100/6250] eta: 0:23:20 lr: 0.000003 grad: 0.1949 (0.3696) loss: 0.7891 (0.7855) time: 0.2665 data: 0.0001 max mem: 26157 Train: [90] [1200/6250] eta: 0:22:51 lr: 0.000003 grad: 0.2105 (0.3632) loss: 0.7900 (0.7853) time: 0.2668 data: 0.0001 max mem: 26157 Train: [90] [1300/6250] eta: 0:22:22 lr: 0.000003 grad: 0.1931 (0.3715) loss: 0.7891 (0.7854) time: 0.2667 data: 0.0001 max mem: 26157 Train: [90] [1400/6250] eta: 0:21:53 lr: 0.000003 grad: 0.2245 (0.3665) loss: 0.7837 (0.7853) time: 0.2671 data: 0.0001 max mem: 26157 Train: [90] [1500/6250] eta: 0:21:25 lr: 0.000003 grad: 0.2048 (0.3642) loss: 0.7853 (0.7853) time: 0.2669 data: 0.0001 max mem: 26157 Train: [90] [1600/6250] eta: 0:20:57 lr: 0.000003 grad: 0.2159 (0.3616) loss: 0.7819 (0.7851) time: 0.2664 data: 0.0001 max mem: 26157 Train: [90] [1700/6250] eta: 0:20:29 lr: 0.000003 grad: 0.2118 (0.3644) loss: 0.7840 (0.7849) time: 0.2672 data: 0.0001 max mem: 26157 Train: [90] [1800/6250] eta: 0:20:01 lr: 0.000003 grad: 0.2137 (0.3625) loss: 0.7775 (0.7847) time: 0.2670 data: 0.0001 max mem: 26157 Train: [90] [1900/6250] eta: 0:19:33 lr: 0.000003 grad: 0.2140 (0.3607) loss: 0.7771 (0.7844) time: 0.2671 data: 0.0001 max mem: 26157 Train: [90] [2000/6250] eta: 0:19:06 lr: 0.000003 grad: 0.2141 (0.3559) loss: 0.7792 (0.7844) time: 0.2678 data: 0.0001 max mem: 26157 Train: [90] [2100/6250] eta: 0:18:38 lr: 0.000003 grad: 0.2075 (0.3601) loss: 0.7835 (0.7842) time: 0.2669 data: 0.0001 max mem: 26157 Train: [90] [2200/6250] eta: 0:18:11 lr: 0.000003 grad: 0.2100 (0.3569) loss: 0.7878 (0.7843) time: 0.2669 data: 0.0001 max mem: 26157 Train: [90] [2300/6250] eta: 0:17:43 lr: 0.000003 grad: 0.2112 (0.3557) loss: 0.7817 (0.7842) time: 0.2667 data: 0.0001 max mem: 26157 Train: [90] [2400/6250] eta: 0:17:16 lr: 0.000003 grad: 0.2209 (0.3607) loss: 0.7827 (0.7842) time: 0.2658 data: 0.0001 max mem: 26157 Train: [90] [2500/6250] eta: 0:16:49 lr: 0.000003 grad: 0.2050 (0.3619) loss: 0.7809 (0.7841) time: 0.2667 data: 0.0001 max mem: 26157 Train: [90] [2600/6250] eta: 0:16:22 lr: 0.000003 grad: 0.2190 (0.3613) loss: 0.7851 (0.7841) time: 0.2673 data: 0.0001 max mem: 26157 Train: [90] [2700/6250] eta: 0:15:54 lr: 0.000003 grad: 0.2107 (0.3658) loss: 0.7803 (0.7842) time: 0.2671 data: 0.0001 max mem: 26157 Train: [90] [2800/6250] eta: 0:15:27 lr: 0.000003 grad: 0.2118 (0.3634) loss: 0.7812 (0.7842) time: 0.2667 data: 0.0001 max mem: 26157 Train: [90] [2900/6250] eta: 0:15:00 lr: 0.000003 grad: 0.2041 (0.3589) loss: 0.7838 (0.7842) time: 0.2669 data: 0.0001 max mem: 26157 Train: [90] [3000/6250] eta: 0:14:33 lr: 0.000003 grad: 0.2305 (0.3659) loss: 0.7873 (0.7842) time: 0.2667 data: 0.0001 max mem: 26157 Train: [90] [3100/6250] eta: 0:14:06 lr: 0.000003 grad: 0.2119 (0.3640) loss: 0.7803 (0.7841) time: 0.2670 data: 0.0001 max mem: 26157 Train: [90] [3200/6250] eta: 0:13:39 lr: 0.000003 grad: 0.2147 (0.3673) loss: 0.7817 (0.7841) time: 0.2660 data: 0.0002 max mem: 26157 Train: [90] [3300/6250] eta: 0:13:12 lr: 0.000003 grad: 0.2130 (0.3659) loss: 0.7856 (0.7841) time: 0.2677 data: 0.0001 max mem: 26157 Train: [90] [3400/6250] eta: 0:12:45 lr: 0.000003 grad: 0.2090 (0.3654) loss: 0.7857 (0.7840) time: 0.2668 data: 0.0001 max mem: 26157 Train: [90] [3500/6250] eta: 0:12:18 lr: 0.000003 grad: 0.2281 (0.3676) loss: 0.7835 (0.7840) time: 0.2707 data: 0.0002 max mem: 26157 Train: [90] [3600/6250] eta: 0:11:51 lr: 0.000003 grad: 0.2149 (0.3686) loss: 0.7839 (0.7839) time: 0.2669 data: 0.0001 max mem: 26157 Train: [90] [3700/6250] eta: 0:11:24 lr: 0.000003 grad: 0.2067 (0.3669) loss: 0.7871 (0.7839) time: 0.2664 data: 0.0001 max mem: 26157 Train: [90] [3800/6250] eta: 0:10:57 lr: 0.000003 grad: 0.1959 (0.3693) loss: 0.7856 (0.7839) time: 0.2668 data: 0.0001 max mem: 26157 Train: [90] [3900/6250] eta: 0:10:30 lr: 0.000003 grad: 0.2237 (0.3685) loss: 0.7719 (0.7838) time: 0.2659 data: 0.0001 max mem: 26157 Train: [90] [4000/6250] eta: 0:10:04 lr: 0.000003 grad: 0.2183 (0.3670) loss: 0.7850 (0.7838) time: 0.2672 data: 0.0001 max mem: 26157 Train: [90] [4100/6250] eta: 0:09:37 lr: 0.000003 grad: 0.2112 (0.3875) loss: 0.7854 (0.7837) time: 0.2671 data: 0.0001 max mem: 26157 Train: [90] [4200/6250] eta: 0:09:10 lr: 0.000003 grad: 0.2035 (0.3854) loss: 0.7787 (0.7837) time: 0.2667 data: 0.0001 max mem: 26157 Train: [90] [4300/6250] eta: 0:08:43 lr: 0.000003 grad: 0.2018 (0.3839) loss: 0.7899 (0.7836) time: 0.2666 data: 0.0001 max mem: 26157 Train: [90] [4400/6250] eta: 0:08:16 lr: 0.000003 grad: 0.2176 (0.3893) loss: 0.7807 (0.7835) time: 0.2675 data: 0.0002 max mem: 26157 Train: [90] [4500/6250] eta: 0:07:49 lr: 0.000003 grad: 0.2050 (0.3948) loss: 0.7842 (0.7835) time: 0.2669 data: 0.0001 max mem: 26157 Train: [90] [4600/6250] eta: 0:07:22 lr: 0.000003 grad: 0.2310 (0.3943) loss: 0.7780 (0.7835) time: 0.2673 data: 0.0001 max mem: 26157 Train: [90] [4700/6250] eta: 0:06:55 lr: 0.000003 grad: 0.2118 (0.3913) loss: 0.7874 (0.7834) time: 0.2663 data: 0.0001 max mem: 26157 Train: [90] [4800/6250] eta: 0:06:28 lr: 0.000003 grad: 0.2191 (0.3931) loss: 0.7796 (0.7834) time: 0.2668 data: 0.0001 max mem: 26157 Train: [90] [4900/6250] eta: 0:06:02 lr: 0.000003 grad: 0.2183 (0.3934) loss: 0.7774 (0.7834) time: 0.2668 data: 0.0001 max mem: 26157 Train: [90] [5000/6250] eta: 0:05:35 lr: 0.000003 grad: 0.2106 (0.3936) loss: 0.7836 (0.7835) time: 0.2666 data: 0.0001 max mem: 26157 Train: [90] [5100/6250] eta: 0:05:08 lr: 0.000003 grad: 0.2032 (0.3985) loss: 0.7783 (0.7834) time: 0.2658 data: 0.0001 max mem: 26157 Train: [90] [5200/6250] eta: 0:04:41 lr: 0.000003 grad: 0.2043 (0.4027) loss: 0.7859 (0.7835) time: 0.2668 data: 0.0001 max mem: 26157 Train: [90] [5300/6250] eta: 0:04:14 lr: 0.000003 grad: 0.2050 (0.4040) loss: 0.7895 (0.7834) time: 0.2667 data: 0.0001 max mem: 26157 Train: [90] [5400/6250] eta: 0:03:47 lr: 0.000003 grad: 0.2093 (0.4048) loss: 0.7830 (0.7834) time: 0.2654 data: 0.0001 max mem: 26157 Train: [90] [5500/6250] eta: 0:03:20 lr: 0.000003 grad: 0.2078 (0.4041) loss: 0.7764 (0.7833) time: 0.2666 data: 0.0001 max mem: 26157 Train: [90] [5600/6250] eta: 0:02:54 lr: 0.000003 grad: 0.2059 (0.4028) loss: 0.7847 (0.7833) time: 0.2667 data: 0.0001 max mem: 26157 Train: [90] [5700/6250] eta: 0:02:27 lr: 0.000003 grad: 0.2275 (0.4017) loss: 0.7778 (0.7832) time: 0.2675 data: 0.0001 max mem: 26157 Train: [90] [5800/6250] eta: 0:02:00 lr: 0.000003 grad: 0.2124 (0.4035) loss: 0.7756 (0.7831) time: 0.2667 data: 0.0001 max mem: 26157 Train: [90] [5900/6250] eta: 0:01:33 lr: 0.000003 grad: 0.2056 (0.4033) loss: 0.7858 (0.7831) time: 0.2671 data: 0.0001 max mem: 26157 Train: [90] [6000/6250] eta: 0:01:06 lr: 0.000003 grad: 0.2301 (0.4034) loss: 0.7870 (0.7831) time: 0.2666 data: 0.0001 max mem: 26157 Train: [90] [6100/6250] eta: 0:00:40 lr: 0.000003 grad: 0.2105 (0.4030) loss: 0.7756 (0.7830) time: 0.2675 data: 0.0001 max mem: 26157 Train: [90] [6200/6250] eta: 0:00:13 lr: 0.000003 grad: 0.2102 (0.4009) loss: 0.7814 (0.7829) time: 0.2665 data: 0.0001 max mem: 26157 Train: [90] [6249/6250] eta: 0:00:00 lr: 0.000003 grad: 0.2176 (0.4013) loss: 0.7829 (0.7829) time: 0.2672 data: 0.0001 max mem: 26157 Train: [90] Total time: 0:27:57 (0.2684 s / it) Averaged stats: lr: 0.000003 grad: 0.2176 (0.4013) loss: 0.7829 (0.7829) Eval (hcp-train-subset): [90] [ 0/62] eta: 0:04:10 loss: 0.7963 (0.7963) time: 4.0411 data: 3.9586 max mem: 26157 Eval (hcp-train-subset): [90] [61/62] eta: 0:00:00 loss: 0.7810 (0.7881) time: 0.0826 data: 0.0001 max mem: 26157 Eval (hcp-train-subset): [90] Total time: 0:00:10 (0.1726 s / it) Averaged stats (hcp-train-subset): loss: 0.7810 (0.7881) Making plots (hcp-train-subset): example=17 Eval (hcp-val): [90] [ 0/62] eta: 0:03:11 loss: 0.8200 (0.8200) time: 3.0891 data: 2.9740 max mem: 26157 Eval (hcp-val): [90] [61/62] eta: 0:00:00 loss: 0.8216 (0.8233) time: 0.1174 data: 0.0349 max mem: 26157 Eval (hcp-val): [90] Total time: 0:00:11 (0.1876 s / it) Averaged stats (hcp-val): loss: 0.8216 (0.8233) Making plots (hcp-val): example=40 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [91] [ 0/6250] eta: 9:38:52 lr: 0.000003 grad: 0.5643 (0.5643) loss: 0.7245 (0.7245) time: 5.5571 data: 5.2755 max mem: 26157 Train: [91] [ 100/6250] eta: 0:32:48 lr: 0.000003 grad: 0.2763 (0.3980) loss: 0.7876 (0.7926) time: 0.2670 data: 0.0002 max mem: 26157 Train: [91] [ 200/6250] eta: 0:29:36 lr: 0.000003 grad: 0.2734 (0.5028) loss: 0.7844 (0.7876) time: 0.2668 data: 0.0001 max mem: 26157 Train: [91] [ 300/6250] eta: 0:28:14 lr: 0.000003 grad: 0.2239 (0.4351) loss: 0.7872 (0.7849) time: 0.2681 data: 0.0001 max mem: 26157 Train: [91] [ 400/6250] eta: 0:27:20 lr: 0.000003 grad: 0.2265 (0.4643) loss: 0.7776 (0.7838) time: 0.2674 data: 0.0001 max mem: 26157 Train: [91] [ 500/6250] eta: 0:26:36 lr: 0.000003 grad: 0.2670 (0.4393) loss: 0.7800 (0.7832) time: 0.2668 data: 0.0001 max mem: 26157 Train: [91] [ 600/6250] eta: 0:25:58 lr: 0.000003 grad: 0.2418 (0.4635) loss: 0.7699 (0.7826) time: 0.2666 data: 0.0001 max mem: 26157 Train: [91] [ 700/6250] eta: 0:25:24 lr: 0.000003 grad: 0.2272 (0.4573) loss: 0.7903 (0.7829) time: 0.2668 data: 0.0001 max mem: 26157 Train: [91] [ 800/6250] eta: 0:24:51 lr: 0.000003 grad: 0.2212 (0.4393) loss: 0.7877 (0.7834) time: 0.2682 data: 0.0001 max mem: 26157 Train: [91] [ 900/6250] eta: 0:24:20 lr: 0.000003 grad: 0.2117 (0.4458) loss: 0.7851 (0.7836) time: 0.2668 data: 0.0001 max mem: 26157 Train: [91] [1000/6250] eta: 0:23:50 lr: 0.000003 grad: 0.2333 (0.4394) loss: 0.7936 (0.7838) time: 0.2717 data: 0.0002 max mem: 26157 Train: [91] [1100/6250] eta: 0:23:20 lr: 0.000003 grad: 0.2091 (0.4475) loss: 0.7845 (0.7840) time: 0.2662 data: 0.0001 max mem: 26157 Train: [91] [1200/6250] eta: 0:22:51 lr: 0.000003 grad: 0.2230 (0.4516) loss: 0.7816 (0.7841) time: 0.2659 data: 0.0002 max mem: 26157 Train: [91] [1300/6250] eta: 0:22:22 lr: 0.000003 grad: 0.2099 (0.4393) loss: 0.7899 (0.7842) time: 0.2680 data: 0.0001 max mem: 26157 Train: [91] [1400/6250] eta: 0:21:53 lr: 0.000003 grad: 0.2025 (0.4262) loss: 0.7907 (0.7843) time: 0.2666 data: 0.0001 max mem: 26157 Train: [91] [1500/6250] eta: 0:21:25 lr: 0.000003 grad: 0.2127 (0.4337) loss: 0.7843 (0.7843) time: 0.2667 data: 0.0001 max mem: 26157 Train: [91] [1600/6250] eta: 0:20:57 lr: 0.000003 grad: 0.2111 (0.4254) loss: 0.7788 (0.7843) time: 0.2665 data: 0.0001 max mem: 26157 Train: [91] [1700/6250] eta: 0:20:29 lr: 0.000003 grad: 0.2065 (0.4287) loss: 0.7874 (0.7843) time: 0.2667 data: 0.0001 max mem: 26157 Train: [91] [1800/6250] eta: 0:20:01 lr: 0.000003 grad: 0.2078 (0.4256) loss: 0.7785 (0.7841) time: 0.2684 data: 0.0002 max mem: 26157 Train: [91] [1900/6250] eta: 0:19:34 lr: 0.000003 grad: 0.2267 (0.4210) loss: 0.7801 (0.7841) time: 0.2679 data: 0.0002 max mem: 26157 Train: [91] [2000/6250] eta: 0:19:06 lr: 0.000003 grad: 0.2150 (0.4187) loss: 0.7827 (0.7841) time: 0.2663 data: 0.0001 max mem: 26157 Train: [91] [2100/6250] eta: 0:18:38 lr: 0.000003 grad: 0.2064 (0.4116) loss: 0.7891 (0.7839) time: 0.2670 data: 0.0001 max mem: 26157 Train: [91] [2200/6250] eta: 0:18:11 lr: 0.000003 grad: 0.2241 (0.4049) loss: 0.7861 (0.7838) time: 0.2676 data: 0.0001 max mem: 26157 Train: [91] [2300/6250] eta: 0:17:44 lr: 0.000003 grad: 0.2128 (0.4040) loss: 0.7800 (0.7836) time: 0.2677 data: 0.0001 max mem: 26157 Train: [91] [2400/6250] eta: 0:17:16 lr: 0.000003 grad: 0.2244 (0.4004) loss: 0.7740 (0.7834) time: 0.2667 data: 0.0002 max mem: 26157 Train: [91] [2500/6250] eta: 0:16:49 lr: 0.000003 grad: 0.2021 (0.4019) loss: 0.7844 (0.7833) time: 0.2662 data: 0.0001 max mem: 26157 Train: [91] [2600/6250] eta: 0:16:22 lr: 0.000003 grad: 0.2216 (0.4061) loss: 0.7785 (0.7832) time: 0.2675 data: 0.0001 max mem: 26157 Train: [91] [2700/6250] eta: 0:15:55 lr: 0.000002 grad: 0.2224 (0.4088) loss: 0.7811 (0.7830) time: 0.2668 data: 0.0001 max mem: 26157 Train: [91] [2800/6250] eta: 0:15:27 lr: 0.000002 grad: 0.2105 (0.4102) loss: 0.7832 (0.7830) time: 0.2675 data: 0.0001 max mem: 26157 Train: [91] [2900/6250] eta: 0:15:00 lr: 0.000002 grad: 0.2129 (0.4101) loss: 0.7808 (0.7828) time: 0.2665 data: 0.0001 max mem: 26157 Train: [91] [3000/6250] eta: 0:14:33 lr: 0.000002 grad: 0.2136 (0.4077) loss: 0.7819 (0.7827) time: 0.2661 data: 0.0001 max mem: 26157 Train: [91] [3100/6250] eta: 0:14:06 lr: 0.000002 grad: 0.2200 (0.4050) loss: 0.7743 (0.7825) time: 0.2659 data: 0.0002 max mem: 26157 Train: [91] [3200/6250] eta: 0:13:39 lr: 0.000002 grad: 0.2156 (0.4118) loss: 0.7767 (0.7824) time: 0.2666 data: 0.0001 max mem: 26157 Train: [91] [3300/6250] eta: 0:13:12 lr: 0.000002 grad: 0.2168 (0.4082) loss: 0.7762 (0.7823) time: 0.2665 data: 0.0001 max mem: 26157 Train: [91] [3400/6250] eta: 0:12:45 lr: 0.000002 grad: 0.2146 (0.4082) loss: 0.7758 (0.7822) time: 0.2681 data: 0.0001 max mem: 26157 Train: [91] [3500/6250] eta: 0:12:18 lr: 0.000002 grad: 0.2165 (0.4080) loss: 0.7750 (0.7822) time: 0.2672 data: 0.0001 max mem: 26157 Train: [91] [3600/6250] eta: 0:11:51 lr: 0.000002 grad: 0.1965 (0.4036) loss: 0.7827 (0.7821) time: 0.2671 data: 0.0001 max mem: 26157 Train: [91] [3700/6250] eta: 0:11:24 lr: 0.000002 grad: 0.2288 (0.4035) loss: 0.7730 (0.7820) time: 0.2667 data: 0.0001 max mem: 26157 Train: [91] [3800/6250] eta: 0:10:57 lr: 0.000002 grad: 0.2064 (0.4011) loss: 0.7795 (0.7820) time: 0.2667 data: 0.0001 max mem: 26157 Train: [91] [3900/6250] eta: 0:10:30 lr: 0.000002 grad: 0.2059 (0.4025) loss: 0.7720 (0.7819) time: 0.2667 data: 0.0002 max mem: 26157 Train: [91] [4000/6250] eta: 0:10:03 lr: 0.000002 grad: 0.2102 (0.3995) loss: 0.7811 (0.7818) time: 0.2680 data: 0.0002 max mem: 26157 Train: [91] [4100/6250] eta: 0:09:36 lr: 0.000002 grad: 0.2040 (0.3973) loss: 0.7846 (0.7818) time: 0.2676 data: 0.0001 max mem: 26157 Train: [91] [4200/6250] eta: 0:09:10 lr: 0.000002 grad: 0.2352 (0.3949) loss: 0.7762 (0.7817) time: 0.2669 data: 0.0001 max mem: 26157 Train: [91] [4300/6250] eta: 0:08:43 lr: 0.000002 grad: 0.2151 (0.3955) loss: 0.7867 (0.7817) time: 0.2669 data: 0.0001 max mem: 26157 Train: [91] [4400/6250] eta: 0:08:16 lr: 0.000002 grad: 0.2124 (0.3970) loss: 0.7857 (0.7818) time: 0.2675 data: 0.0001 max mem: 26157 Train: [91] [4500/6250] eta: 0:07:49 lr: 0.000002 grad: 0.2061 (0.3983) loss: 0.7819 (0.7817) time: 0.2670 data: 0.0001 max mem: 26157 Train: [91] [4600/6250] eta: 0:07:22 lr: 0.000002 grad: 0.2103 (0.3970) loss: 0.7829 (0.7817) time: 0.2667 data: 0.0001 max mem: 26157 Train: [91] [4700/6250] eta: 0:06:55 lr: 0.000002 grad: 0.2055 (0.3977) loss: 0.7780 (0.7817) time: 0.2673 data: 0.0001 max mem: 26157 Train: [91] [4800/6250] eta: 0:06:28 lr: 0.000002 grad: 0.2050 (0.4023) loss: 0.7870 (0.7818) time: 0.2665 data: 0.0001 max mem: 26157 Train: [91] [4900/6250] eta: 0:06:01 lr: 0.000002 grad: 0.2110 (0.4039) loss: 0.7843 (0.7818) time: 0.2674 data: 0.0002 max mem: 26157 Train: [91] [5000/6250] eta: 0:05:35 lr: 0.000002 grad: 0.2011 (0.4012) loss: 0.7901 (0.7819) time: 0.2666 data: 0.0001 max mem: 26157 Train: [91] [5100/6250] eta: 0:05:08 lr: 0.000002 grad: 0.2069 (0.4064) loss: 0.7835 (0.7820) time: 0.2670 data: 0.0002 max mem: 26157 Train: [91] [5200/6250] eta: 0:04:41 lr: 0.000002 grad: 0.2000 (0.4057) loss: 0.7834 (0.7821) time: 0.2662 data: 0.0001 max mem: 26157 Train: [91] [5300/6250] eta: 0:04:14 lr: 0.000002 grad: 0.2127 (0.4045) loss: 0.7800 (0.7822) time: 0.2669 data: 0.0001 max mem: 26157 Train: [91] [5400/6250] eta: 0:03:47 lr: 0.000002 grad: 0.2181 (0.4043) loss: 0.7819 (0.7823) time: 0.2664 data: 0.0001 max mem: 26157 Train: [91] [5500/6250] eta: 0:03:21 lr: 0.000002 grad: 0.2105 (0.4022) loss: 0.7782 (0.7823) time: 0.2670 data: 0.0002 max mem: 26157 Train: [91] [5600/6250] eta: 0:02:54 lr: 0.000002 grad: 0.2218 (0.4032) loss: 0.7755 (0.7822) time: 0.2670 data: 0.0001 max mem: 26157 Train: [91] [5700/6250] eta: 0:02:27 lr: 0.000002 grad: 0.2138 (0.4021) loss: 0.7804 (0.7822) time: 0.2668 data: 0.0001 max mem: 26157 Train: [91] [5800/6250] eta: 0:02:00 lr: 0.000002 grad: 0.2102 (0.4018) loss: 0.7836 (0.7822) time: 0.2666 data: 0.0001 max mem: 26157 Train: [91] [5900/6250] eta: 0:01:33 lr: 0.000002 grad: 0.2079 (0.4005) loss: 0.7837 (0.7822) time: 0.2664 data: 0.0001 max mem: 26157 Train: [91] [6000/6250] eta: 0:01:06 lr: 0.000002 grad: 0.2241 (0.3996) loss: 0.7707 (0.7822) time: 0.2668 data: 0.0001 max mem: 26157 Train: [91] [6100/6250] eta: 0:00:40 lr: 0.000002 grad: 0.2035 (0.4004) loss: 0.7835 (0.7822) time: 0.2669 data: 0.0002 max mem: 26157 Train: [91] [6200/6250] eta: 0:00:13 lr: 0.000002 grad: 0.2047 (0.4009) loss: 0.7879 (0.7822) time: 0.2675 data: 0.0001 max mem: 26157 Train: [91] [6249/6250] eta: 0:00:00 lr: 0.000002 grad: 0.1997 (0.4002) loss: 0.7841 (0.7822) time: 0.2664 data: 0.0001 max mem: 26157 Train: [91] Total time: 0:27:57 (0.2684 s / it) Averaged stats: lr: 0.000002 grad: 0.1997 (0.4002) loss: 0.7841 (0.7822) Eval (hcp-train-subset): [91] [ 0/62] eta: 0:02:55 loss: 0.7952 (0.7952) time: 2.8268 data: 2.7286 max mem: 26157 Eval (hcp-train-subset): [91] [61/62] eta: 0:00:00 loss: 0.7823 (0.7869) time: 0.0824 data: 0.0001 max mem: 26157 Eval (hcp-train-subset): [91] Total time: 0:00:10 (0.1667 s / it) Averaged stats (hcp-train-subset): loss: 0.7823 (0.7869) Making plots (hcp-train-subset): example=5 Eval (hcp-val): [91] [ 0/62] eta: 0:03:11 loss: 0.8214 (0.8214) time: 3.0829 data: 2.9857 max mem: 26157 Eval (hcp-val): [91] [61/62] eta: 0:00:00 loss: 0.8222 (0.8229) time: 0.0959 data: 0.0120 max mem: 26157 Eval (hcp-val): [91] Total time: 0:00:10 (0.1660 s / it) Averaged stats (hcp-val): loss: 0.8222 (0.8229) Making plots (hcp-val): example=57 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [92] [ 0/6250] eta: 8:41:04 lr: 0.000002 grad: 0.2067 (0.2067) loss: 0.8023 (0.8023) time: 5.0024 data: 4.7198 max mem: 26157 Train: [92] [ 100/6250] eta: 0:32:19 lr: 0.000002 grad: 0.2790 (0.3754) loss: 0.7881 (0.8006) time: 0.2691 data: 0.0001 max mem: 26157 Train: [92] [ 200/6250] eta: 0:29:26 lr: 0.000002 grad: 0.2668 (0.3669) loss: 0.7758 (0.7889) time: 0.2686 data: 0.0001 max mem: 26157 Train: [92] [ 300/6250] eta: 0:28:11 lr: 0.000002 grad: 0.2547 (0.4051) loss: 0.7848 (0.7856) time: 0.2680 data: 0.0001 max mem: 26157 Train: [92] [ 400/6250] eta: 0:27:19 lr: 0.000002 grad: 0.2117 (0.3869) loss: 0.7843 (0.7843) time: 0.2700 data: 0.0002 max mem: 26157 Train: [92] [ 500/6250] eta: 0:26:36 lr: 0.000002 grad: 0.2383 (0.3874) loss: 0.7826 (0.7835) time: 0.2671 data: 0.0001 max mem: 26157 Train: [92] [ 600/6250] eta: 0:25:58 lr: 0.000002 grad: 0.2334 (0.3932) loss: 0.7849 (0.7838) time: 0.2668 data: 0.0001 max mem: 26157 Train: [92] [ 700/6250] eta: 0:25:24 lr: 0.000002 grad: 0.2146 (0.4264) loss: 0.7755 (0.7840) time: 0.2674 data: 0.0001 max mem: 26157 Train: [92] [ 800/6250] eta: 0:24:51 lr: 0.000002 grad: 0.2179 (0.4191) loss: 0.7857 (0.7843) time: 0.2671 data: 0.0001 max mem: 26157 Train: [92] [ 900/6250] eta: 0:24:20 lr: 0.000002 grad: 0.2187 (0.4092) loss: 0.7903 (0.7846) time: 0.2664 data: 0.0001 max mem: 26157 Train: [92] [1000/6250] eta: 0:23:50 lr: 0.000002 grad: 0.2331 (0.4099) loss: 0.7818 (0.7847) time: 0.2668 data: 0.0001 max mem: 26157 Train: [92] [1100/6250] eta: 0:23:20 lr: 0.000002 grad: 0.2265 (0.4178) loss: 0.7782 (0.7847) time: 0.2670 data: 0.0001 max mem: 26157 Train: [92] [1200/6250] eta: 0:22:51 lr: 0.000002 grad: 0.2180 (0.4051) loss: 0.7843 (0.7846) time: 0.2671 data: 0.0001 max mem: 26157 Train: [92] [1300/6250] eta: 0:22:22 lr: 0.000002 grad: 0.2210 (0.3986) loss: 0.7879 (0.7843) time: 0.2669 data: 0.0001 max mem: 26157 Train: [92] [1400/6250] eta: 0:21:53 lr: 0.000002 grad: 0.2171 (0.4028) loss: 0.7828 (0.7842) time: 0.2666 data: 0.0001 max mem: 26157 Train: [92] [1500/6250] eta: 0:21:25 lr: 0.000002 grad: 0.2214 (0.3938) loss: 0.7759 (0.7840) time: 0.2674 data: 0.0001 max mem: 26157 Train: [92] [1600/6250] eta: 0:20:57 lr: 0.000002 grad: 0.2164 (0.4044) loss: 0.7840 (0.7839) time: 0.2673 data: 0.0001 max mem: 26157 Train: [92] [1700/6250] eta: 0:20:29 lr: 0.000002 grad: 0.2091 (0.3980) loss: 0.7756 (0.7836) time: 0.2675 data: 0.0002 max mem: 26157 Train: [92] [1800/6250] eta: 0:20:02 lr: 0.000002 grad: 0.2225 (0.4060) loss: 0.7774 (0.7834) time: 0.2677 data: 0.0001 max mem: 26157 Train: [92] [1900/6250] eta: 0:19:34 lr: 0.000002 grad: 0.2289 (0.4072) loss: 0.7821 (0.7835) time: 0.2670 data: 0.0001 max mem: 26157 Train: [92] [2000/6250] eta: 0:19:06 lr: 0.000002 grad: 0.2143 (0.4035) loss: 0.7850 (0.7835) time: 0.2675 data: 0.0001 max mem: 26157 Train: [92] [2100/6250] eta: 0:18:39 lr: 0.000002 grad: 0.2068 (0.4031) loss: 0.7858 (0.7836) time: 0.2688 data: 0.0002 max mem: 26157 Train: [92] [2200/6250] eta: 0:18:12 lr: 0.000002 grad: 0.2186 (0.4090) loss: 0.7794 (0.7836) time: 0.2679 data: 0.0001 max mem: 26157 Train: [92] [2300/6250] eta: 0:17:44 lr: 0.000002 grad: 0.2150 (0.4137) loss: 0.7802 (0.7835) time: 0.2669 data: 0.0001 max mem: 26157 Train: [92] [2400/6250] eta: 0:17:17 lr: 0.000002 grad: 0.1978 (0.4071) loss: 0.7813 (0.7835) time: 0.2665 data: 0.0001 max mem: 26157 Train: [92] [2500/6250] eta: 0:16:49 lr: 0.000002 grad: 0.2209 (0.4066) loss: 0.7800 (0.7835) time: 0.2668 data: 0.0002 max mem: 26157 Train: [92] [2600/6250] eta: 0:16:22 lr: 0.000002 grad: 0.2227 (0.4011) loss: 0.7804 (0.7834) time: 0.2670 data: 0.0001 max mem: 26157 Train: [92] [2700/6250] eta: 0:15:55 lr: 0.000002 grad: 0.2115 (0.3970) loss: 0.7782 (0.7834) time: 0.2662 data: 0.0001 max mem: 26157 Train: [92] [2800/6250] eta: 0:15:28 lr: 0.000002 grad: 0.2204 (0.3987) loss: 0.7843 (0.7833) time: 0.2671 data: 0.0001 max mem: 26157 Train: [92] [2900/6250] eta: 0:15:01 lr: 0.000002 grad: 0.2140 (0.3964) loss: 0.7837 (0.7833) time: 0.2665 data: 0.0001 max mem: 26157 Train: [92] [3000/6250] eta: 0:14:34 lr: 0.000002 grad: 0.2131 (0.3936) loss: 0.7811 (0.7832) time: 0.2663 data: 0.0001 max mem: 26157 Train: [92] [3100/6250] eta: 0:14:06 lr: 0.000002 grad: 0.2166 (0.3883) loss: 0.7782 (0.7833) time: 0.2674 data: 0.0001 max mem: 26157 Train: [92] [3200/6250] eta: 0:13:39 lr: 0.000002 grad: 0.2308 (0.3888) loss: 0.7812 (0.7832) time: 0.2682 data: 0.0001 max mem: 26157 Train: [92] [3300/6250] eta: 0:13:12 lr: 0.000002 grad: 0.2278 (0.3900) loss: 0.7816 (0.7832) time: 0.2674 data: 0.0001 max mem: 26157 Train: [92] [3400/6250] eta: 0:12:45 lr: 0.000002 grad: 0.2316 (0.3874) loss: 0.7811 (0.7832) time: 0.2666 data: 0.0001 max mem: 26157 Train: [92] [3500/6250] eta: 0:12:18 lr: 0.000002 grad: 0.2219 (0.3833) loss: 0.7836 (0.7832) time: 0.2712 data: 0.0002 max mem: 26157 Train: [92] [3600/6250] eta: 0:11:52 lr: 0.000002 grad: 0.2117 (0.3913) loss: 0.7840 (0.7832) time: 0.2671 data: 0.0001 max mem: 26157 Train: [92] [3700/6250] eta: 0:11:25 lr: 0.000002 grad: 0.2234 (0.3957) loss: 0.7799 (0.7832) time: 0.2663 data: 0.0001 max mem: 26157 Train: [92] [3800/6250] eta: 0:10:58 lr: 0.000002 grad: 0.2301 (0.3974) loss: 0.7757 (0.7830) time: 0.2666 data: 0.0001 max mem: 26157 Train: [92] [3900/6250] eta: 0:10:31 lr: 0.000002 grad: 0.2222 (0.3981) loss: 0.7790 (0.7830) time: 0.2671 data: 0.0001 max mem: 26157 Train: [92] [4000/6250] eta: 0:10:04 lr: 0.000002 grad: 0.2181 (0.4070) loss: 0.7872 (0.7830) time: 0.2665 data: 0.0001 max mem: 26157 Train: [92] [4100/6250] eta: 0:09:37 lr: 0.000002 grad: 0.2250 (0.4081) loss: 0.7875 (0.7829) time: 0.2672 data: 0.0001 max mem: 26157 Train: [92] [4200/6250] eta: 0:09:10 lr: 0.000002 grad: 0.2184 (0.4134) loss: 0.7786 (0.7828) time: 0.2666 data: 0.0001 max mem: 26157 Train: [92] [4300/6250] eta: 0:08:43 lr: 0.000002 grad: 0.2303 (0.4111) loss: 0.7773 (0.7827) time: 0.2664 data: 0.0001 max mem: 26157 Train: [92] [4400/6250] eta: 0:08:16 lr: 0.000002 grad: 0.2370 (0.4107) loss: 0.7738 (0.7827) time: 0.2681 data: 0.0001 max mem: 26157 Train: [92] [4500/6250] eta: 0:07:49 lr: 0.000002 grad: 0.2096 (0.4108) loss: 0.7837 (0.7826) time: 0.2663 data: 0.0001 max mem: 26157 Train: [92] [4600/6250] eta: 0:07:22 lr: 0.000002 grad: 0.2210 (0.4120) loss: 0.7794 (0.7826) time: 0.2666 data: 0.0001 max mem: 26157 Train: [92] [4700/6250] eta: 0:06:55 lr: 0.000002 grad: 0.2278 (0.4102) loss: 0.7893 (0.7826) time: 0.2672 data: 0.0001 max mem: 26157 Train: [92] [4800/6250] eta: 0:06:28 lr: 0.000002 grad: 0.2212 (0.4117) loss: 0.7798 (0.7826) time: 0.2678 data: 0.0001 max mem: 26157 Train: [92] [4900/6250] eta: 0:06:02 lr: 0.000002 grad: 0.2194 (0.4115) loss: 0.7807 (0.7826) time: 0.2670 data: 0.0001 max mem: 26157 Train: [92] [5000/6250] eta: 0:05:35 lr: 0.000002 grad: 0.2212 (0.4098) loss: 0.7768 (0.7825) time: 0.2672 data: 0.0001 max mem: 26157 Train: [92] [5100/6250] eta: 0:05:08 lr: 0.000002 grad: 0.2326 (0.4071) loss: 0.7862 (0.7825) time: 0.2668 data: 0.0001 max mem: 26157 Train: [92] [5200/6250] eta: 0:04:41 lr: 0.000002 grad: 0.2057 (0.4060) loss: 0.7870 (0.7825) time: 0.2670 data: 0.0001 max mem: 26157 Train: [92] [5300/6250] eta: 0:04:14 lr: 0.000002 grad: 0.2247 (0.4059) loss: 0.7784 (0.7824) time: 0.2687 data: 0.0001 max mem: 26157 Train: [92] [5400/6250] eta: 0:03:47 lr: 0.000002 grad: 0.2206 (0.4054) loss: 0.7762 (0.7824) time: 0.2669 data: 0.0001 max mem: 26157 Train: [92] [5500/6250] eta: 0:03:21 lr: 0.000002 grad: 0.1997 (0.4035) loss: 0.7754 (0.7823) time: 0.2670 data: 0.0001 max mem: 26157 Train: [92] [5600/6250] eta: 0:02:54 lr: 0.000002 grad: 0.2152 (0.4038) loss: 0.7761 (0.7823) time: 0.2668 data: 0.0002 max mem: 26157 Train: [92] [5700/6250] eta: 0:02:27 lr: 0.000002 grad: 0.2179 (0.4031) loss: 0.7769 (0.7822) time: 0.2669 data: 0.0001 max mem: 26157 Train: [92] [5800/6250] eta: 0:02:00 lr: 0.000002 grad: 0.2139 (0.4044) loss: 0.7820 (0.7822) time: 0.2677 data: 0.0001 max mem: 26157 Train: [92] [5900/6250] eta: 0:01:33 lr: 0.000002 grad: 0.2139 (0.4014) loss: 0.7780 (0.7821) time: 0.2668 data: 0.0001 max mem: 26157 Train: [92] [6000/6250] eta: 0:01:06 lr: 0.000002 grad: 0.2171 (0.4006) loss: 0.7752 (0.7820) time: 0.2669 data: 0.0002 max mem: 26157 Train: [92] [6100/6250] eta: 0:00:40 lr: 0.000002 grad: 0.2120 (0.3986) loss: 0.7833 (0.7819) time: 0.2673 data: 0.0001 max mem: 26157 Train: [92] [6200/6250] eta: 0:00:13 lr: 0.000002 grad: 0.2193 (0.3977) loss: 0.7738 (0.7818) time: 0.2677 data: 0.0001 max mem: 26157 Train: [92] [6249/6250] eta: 0:00:00 lr: 0.000002 grad: 0.2134 (0.3969) loss: 0.7722 (0.7817) time: 0.2666 data: 0.0001 max mem: 26157 Train: [92] Total time: 0:27:58 (0.2685 s / it) Averaged stats: lr: 0.000002 grad: 0.2134 (0.3969) loss: 0.7722 (0.7817) Eval (hcp-train-subset): [92] [ 0/62] eta: 0:04:46 loss: 0.7920 (0.7920) time: 4.6216 data: 4.5383 max mem: 26157 Eval (hcp-train-subset): [92] [61/62] eta: 0:00:00 loss: 0.7810 (0.7857) time: 0.1156 data: 0.0331 max mem: 26157 Eval (hcp-train-subset): [92] Total time: 0:00:11 (0.1866 s / it) Averaged stats (hcp-train-subset): loss: 0.7810 (0.7857) Making plots (hcp-train-subset): example=30 Eval (hcp-val): [92] [ 0/62] eta: 0:04:51 loss: 0.8167 (0.8167) time: 4.7032 data: 4.6204 max mem: 26157 Eval (hcp-val): [92] [61/62] eta: 0:00:00 loss: 0.8224 (0.8229) time: 0.0928 data: 0.0105 max mem: 26157 Eval (hcp-val): [92] Total time: 0:00:11 (0.1794 s / it) Averaged stats (hcp-val): loss: 0.8224 (0.8229) Making plots (hcp-val): example=33 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [93] [ 0/6250] eta: 8:15:36 lr: 0.000002 grad: 0.2789 (0.2789) loss: 0.8213 (0.8213) time: 4.7579 data: 4.4867 max mem: 26157 Train: [93] [ 100/6250] eta: 0:31:59 lr: 0.000002 grad: 0.2803 (0.4082) loss: 0.7969 (0.7862) time: 0.2676 data: 0.0002 max mem: 26157 Train: [93] [ 200/6250] eta: 0:29:12 lr: 0.000002 grad: 0.2351 (0.3594) loss: 0.7835 (0.7850) time: 0.2665 data: 0.0001 max mem: 26157 Train: [93] [ 300/6250] eta: 0:27:59 lr: 0.000002 grad: 0.2300 (0.3674) loss: 0.7884 (0.7840) time: 0.2687 data: 0.0002 max mem: 26157 Train: [93] [ 400/6250] eta: 0:27:09 lr: 0.000002 grad: 0.2305 (0.3659) loss: 0.7753 (0.7843) time: 0.2675 data: 0.0002 max mem: 26157 Train: [93] [ 500/6250] eta: 0:26:29 lr: 0.000002 grad: 0.2450 (0.3847) loss: 0.7820 (0.7845) time: 0.2674 data: 0.0001 max mem: 26157 Train: [93] [ 600/6250] eta: 0:25:53 lr: 0.000002 grad: 0.2045 (0.3644) loss: 0.7899 (0.7845) time: 0.2667 data: 0.0001 max mem: 26157 Train: [93] [ 700/6250] eta: 0:25:19 lr: 0.000002 grad: 0.2096 (0.3902) loss: 0.7929 (0.7846) time: 0.2672 data: 0.0001 max mem: 26157 Train: [93] [ 800/6250] eta: 0:24:47 lr: 0.000002 grad: 0.2066 (0.3840) loss: 0.7851 (0.7844) time: 0.2668 data: 0.0001 max mem: 26157 Train: [93] [ 900/6250] eta: 0:24:16 lr: 0.000002 grad: 0.2037 (0.3718) loss: 0.7866 (0.7845) time: 0.2666 data: 0.0001 max mem: 26157 Train: [93] [1000/6250] eta: 0:23:46 lr: 0.000002 grad: 0.2141 (0.3766) loss: 0.7879 (0.7845) time: 0.2673 data: 0.0001 max mem: 26157 Train: [93] [1100/6250] eta: 0:23:17 lr: 0.000002 grad: 0.2097 (0.3888) loss: 0.7866 (0.7843) time: 0.2663 data: 0.0001 max mem: 26157 Train: [93] [1200/6250] eta: 0:22:48 lr: 0.000002 grad: 0.2106 (0.3815) loss: 0.7803 (0.7841) time: 0.2660 data: 0.0001 max mem: 26157 Train: [93] [1300/6250] eta: 0:22:19 lr: 0.000002 grad: 0.2122 (0.3785) loss: 0.7864 (0.7841) time: 0.2674 data: 0.0001 max mem: 26157 Train: [93] [1400/6250] eta: 0:21:51 lr: 0.000002 grad: 0.2274 (0.3865) loss: 0.7866 (0.7840) time: 0.2672 data: 0.0001 max mem: 26157 Train: [93] [1500/6250] eta: 0:21:22 lr: 0.000002 grad: 0.1934 (0.4013) loss: 0.7786 (0.7841) time: 0.2663 data: 0.0001 max mem: 26157 Train: [93] [1600/6250] eta: 0:20:55 lr: 0.000002 grad: 0.2106 (0.3946) loss: 0.7853 (0.7841) time: 0.2673 data: 0.0001 max mem: 26157 Train: [93] [1700/6250] eta: 0:20:27 lr: 0.000002 grad: 0.2073 (0.3868) loss: 0.7846 (0.7841) time: 0.2677 data: 0.0002 max mem: 26157 Train: [93] [1800/6250] eta: 0:19:59 lr: 0.000002 grad: 0.2104 (0.3818) loss: 0.7789 (0.7841) time: 0.2670 data: 0.0002 max mem: 26157 Train: [93] [1900/6250] eta: 0:19:32 lr: 0.000002 grad: 0.2235 (0.3791) loss: 0.7805 (0.7840) time: 0.2665 data: 0.0002 max mem: 26157 Train: [93] [2000/6250] eta: 0:19:04 lr: 0.000002 grad: 0.2082 (0.3776) loss: 0.7868 (0.7840) time: 0.2671 data: 0.0001 max mem: 26157 Train: [93] [2100/6250] eta: 0:18:37 lr: 0.000002 grad: 0.2169 (0.3748) loss: 0.7819 (0.7837) time: 0.2664 data: 0.0001 max mem: 26157 Train: [93] [2200/6250] eta: 0:18:09 lr: 0.000002 grad: 0.2060 (0.3750) loss: 0.7774 (0.7837) time: 0.2667 data: 0.0001 max mem: 26157 Train: [93] [2300/6250] eta: 0:17:42 lr: 0.000001 grad: 0.2071 (0.3742) loss: 0.7829 (0.7836) time: 0.2669 data: 0.0001 max mem: 26157 Train: [93] [2400/6250] eta: 0:17:15 lr: 0.000001 grad: 0.2292 (0.3717) loss: 0.7796 (0.7835) time: 0.2672 data: 0.0001 max mem: 26157 Train: [93] [2500/6250] eta: 0:16:48 lr: 0.000001 grad: 0.2155 (0.3715) loss: 0.7859 (0.7836) time: 0.2668 data: 0.0002 max mem: 26157 Train: [93] [2600/6250] eta: 0:16:21 lr: 0.000001 grad: 0.1981 (0.3698) loss: 0.7842 (0.7835) time: 0.2666 data: 0.0001 max mem: 26157 Train: [93] [2700/6250] eta: 0:15:53 lr: 0.000001 grad: 0.2026 (0.3657) loss: 0.7775 (0.7834) time: 0.2669 data: 0.0001 max mem: 26157 Train: [93] [2800/6250] eta: 0:15:26 lr: 0.000001 grad: 0.2125 (0.3698) loss: 0.7823 (0.7833) time: 0.2665 data: 0.0001 max mem: 26157 Train: [93] [2900/6250] eta: 0:14:59 lr: 0.000001 grad: 0.2156 (0.3677) loss: 0.7769 (0.7831) time: 0.2671 data: 0.0001 max mem: 26157 Train: [93] [3000/6250] eta: 0:14:32 lr: 0.000001 grad: 0.2159 (0.3656) loss: 0.7895 (0.7831) time: 0.2665 data: 0.0001 max mem: 26157 Train: [93] [3100/6250] eta: 0:14:05 lr: 0.000001 grad: 0.2264 (0.3719) loss: 0.7782 (0.7829) time: 0.2670 data: 0.0002 max mem: 26157 Train: [93] [3200/6250] eta: 0:13:38 lr: 0.000001 grad: 0.2092 (0.3723) loss: 0.7806 (0.7828) time: 0.2664 data: 0.0001 max mem: 26157 Train: [93] [3300/6250] eta: 0:13:11 lr: 0.000001 grad: 0.2188 (0.3696) loss: 0.7773 (0.7828) time: 0.2669 data: 0.0001 max mem: 26157 Train: [93] [3400/6250] eta: 0:12:44 lr: 0.000001 grad: 0.2381 (0.3740) loss: 0.7813 (0.7827) time: 0.2670 data: 0.0001 max mem: 26157 Train: [93] [3500/6250] eta: 0:12:17 lr: 0.000001 grad: 0.2200 (0.3711) loss: 0.7751 (0.7826) time: 0.2682 data: 0.0002 max mem: 26157 Train: [93] [3600/6250] eta: 0:11:50 lr: 0.000001 grad: 0.2126 (0.3714) loss: 0.7790 (0.7825) time: 0.2664 data: 0.0001 max mem: 26157 Train: [93] [3700/6250] eta: 0:11:23 lr: 0.000001 grad: 0.2067 (0.3737) loss: 0.7838 (0.7825) time: 0.2664 data: 0.0001 max mem: 26157 Train: [93] [3800/6250] eta: 0:10:57 lr: 0.000001 grad: 0.2212 (0.3779) loss: 0.7741 (0.7823) time: 0.2677 data: 0.0001 max mem: 26157 Train: [93] [3900/6250] eta: 0:10:30 lr: 0.000001 grad: 0.2234 (0.3808) loss: 0.7697 (0.7822) time: 0.2666 data: 0.0002 max mem: 26157 Train: [93] [4000/6250] eta: 0:10:03 lr: 0.000001 grad: 0.2278 (0.3822) loss: 0.7765 (0.7821) time: 0.2667 data: 0.0001 max mem: 26157 Train: [93] [4100/6250] eta: 0:09:36 lr: 0.000001 grad: 0.2038 (0.3849) loss: 0.7854 (0.7821) time: 0.2667 data: 0.0001 max mem: 26157 Train: [93] [4200/6250] eta: 0:09:09 lr: 0.000001 grad: 0.2143 (0.3852) loss: 0.7859 (0.7821) time: 0.2671 data: 0.0001 max mem: 26157 Train: [93] [4300/6250] eta: 0:08:42 lr: 0.000001 grad: 0.2211 (0.3835) loss: 0.7817 (0.7821) time: 0.2672 data: 0.0001 max mem: 26157 Train: [93] [4400/6250] eta: 0:08:15 lr: 0.000001 grad: 0.2177 (0.3897) loss: 0.7841 (0.7821) time: 0.2664 data: 0.0001 max mem: 26157 Train: [93] [4500/6250] eta: 0:07:48 lr: 0.000001 grad: 0.2195 (0.3875) loss: 0.7820 (0.7821) time: 0.2668 data: 0.0001 max mem: 26157 Train: [93] [4600/6250] eta: 0:07:22 lr: 0.000001 grad: 0.2151 (0.3981) loss: 0.7899 (0.7822) time: 0.2665 data: 0.0001 max mem: 26157 Train: [93] [4700/6250] eta: 0:06:55 lr: 0.000001 grad: 0.2324 (0.3989) loss: 0.7836 (0.7822) time: 0.2671 data: 0.0001 max mem: 26157 Train: [93] [4800/6250] eta: 0:06:28 lr: 0.000001 grad: 0.2109 (0.4035) loss: 0.7808 (0.7823) time: 0.2660 data: 0.0001 max mem: 26157 Train: [93] [4900/6250] eta: 0:06:01 lr: 0.000001 grad: 0.2040 (0.4005) loss: 0.7857 (0.7823) time: 0.2661 data: 0.0001 max mem: 26157 Train: [93] [5000/6250] eta: 0:05:34 lr: 0.000001 grad: 0.2291 (0.3978) loss: 0.7809 (0.7824) time: 0.2674 data: 0.0001 max mem: 26157 Train: [93] [5100/6250] eta: 0:05:07 lr: 0.000001 grad: 0.2137 (0.3964) loss: 0.7876 (0.7824) time: 0.2661 data: 0.0001 max mem: 26157 Train: [93] [5200/6250] eta: 0:04:41 lr: 0.000001 grad: 0.2043 (0.3951) loss: 0.7791 (0.7824) time: 0.2678 data: 0.0001 max mem: 26157 Train: [93] [5300/6250] eta: 0:04:14 lr: 0.000001 grad: 0.2185 (0.3976) loss: 0.7694 (0.7824) time: 0.2668 data: 0.0001 max mem: 26157 Train: [93] [5400/6250] eta: 0:03:47 lr: 0.000001 grad: 0.2082 (0.3991) loss: 0.7849 (0.7824) time: 0.2671 data: 0.0001 max mem: 26157 Train: [93] [5500/6250] eta: 0:03:20 lr: 0.000001 grad: 0.2233 (0.3999) loss: 0.7765 (0.7824) time: 0.2669 data: 0.0001 max mem: 26157 Train: [93] [5600/6250] eta: 0:02:54 lr: 0.000001 grad: 0.2006 (0.4020) loss: 0.7846 (0.7824) time: 0.2663 data: 0.0001 max mem: 26157 Train: [93] [5700/6250] eta: 0:02:27 lr: 0.000001 grad: 0.2190 (0.4009) loss: 0.7807 (0.7824) time: 0.2668 data: 0.0001 max mem: 26157 Train: [93] [5800/6250] eta: 0:02:00 lr: 0.000001 grad: 0.2156 (0.3998) loss: 0.7828 (0.7825) time: 0.2671 data: 0.0001 max mem: 26157 Train: [93] [5900/6250] eta: 0:01:33 lr: 0.000001 grad: 0.2170 (0.3990) loss: 0.7880 (0.7824) time: 0.2679 data: 0.0001 max mem: 26157 Train: [93] [6000/6250] eta: 0:01:06 lr: 0.000001 grad: 0.1997 (0.3992) loss: 0.7887 (0.7825) time: 0.2668 data: 0.0001 max mem: 26157 Train: [93] [6100/6250] eta: 0:00:40 lr: 0.000001 grad: 0.2133 (0.3998) loss: 0.7837 (0.7825) time: 0.2667 data: 0.0001 max mem: 26157 Train: [93] [6200/6250] eta: 0:00:13 lr: 0.000001 grad: 0.2205 (0.3998) loss: 0.7888 (0.7826) time: 0.2663 data: 0.0001 max mem: 26157 Train: [93] [6249/6250] eta: 0:00:00 lr: 0.000001 grad: 0.2110 (0.4008) loss: 0.7904 (0.7826) time: 0.2669 data: 0.0001 max mem: 26157 Train: [93] Total time: 0:27:55 (0.2682 s / it) Averaged stats: lr: 0.000001 grad: 0.2110 (0.4008) loss: 0.7904 (0.7826) Eval (hcp-train-subset): [93] [ 0/62] eta: 0:04:43 loss: 0.7886 (0.7886) time: 4.5793 data: 4.4974 max mem: 26157 Eval (hcp-train-subset): [93] [61/62] eta: 0:00:00 loss: 0.7822 (0.7857) time: 0.0940 data: 0.0115 max mem: 26157 Eval (hcp-train-subset): [93] Total time: 0:00:11 (0.1825 s / it) Averaged stats (hcp-train-subset): loss: 0.7822 (0.7857) Making plots (hcp-train-subset): example=42 Eval (hcp-val): [93] [ 0/62] eta: 0:04:18 loss: 0.8168 (0.8168) time: 4.1761 data: 4.0930 max mem: 26157 Eval (hcp-val): [93] [61/62] eta: 0:00:00 loss: 0.8212 (0.8224) time: 0.0927 data: 0.0089 max mem: 26157 Eval (hcp-val): [93] Total time: 0:00:10 (0.1728 s / it) Averaged stats (hcp-val): loss: 0.8212 (0.8224) Making plots (hcp-val): example=62 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [94] [ 0/6250] eta: 8:13:04 lr: 0.000001 grad: 0.2553 (0.2553) loss: 0.8527 (0.8527) time: 4.7336 data: 4.4623 max mem: 26157 Train: [94] [ 100/6250] eta: 0:32:00 lr: 0.000001 grad: 0.2942 (0.7642) loss: 0.7746 (0.7850) time: 0.2686 data: 0.0001 max mem: 26157 Train: [94] [ 200/6250] eta: 0:29:14 lr: 0.000001 grad: 0.2428 (0.5738) loss: 0.7677 (0.7823) time: 0.2672 data: 0.0001 max mem: 26157 Train: [94] [ 300/6250] eta: 0:28:00 lr: 0.000001 grad: 0.2611 (0.5599) loss: 0.7799 (0.7810) time: 0.2667 data: 0.0001 max mem: 26157 Train: [94] [ 400/6250] eta: 0:27:10 lr: 0.000001 grad: 0.2303 (0.5422) loss: 0.7803 (0.7802) time: 0.2674 data: 0.0001 max mem: 26157 Train: [94] [ 500/6250] eta: 0:26:29 lr: 0.000001 grad: 0.2428 (0.5086) loss: 0.7837 (0.7803) time: 0.2683 data: 0.0001 max mem: 26157 Train: [94] [ 600/6250] eta: 0:25:53 lr: 0.000001 grad: 0.2389 (0.4812) loss: 0.7827 (0.7808) time: 0.2668 data: 0.0001 max mem: 26157 Train: [94] [ 700/6250] eta: 0:25:19 lr: 0.000001 grad: 0.2241 (0.4695) loss: 0.7883 (0.7815) time: 0.2664 data: 0.0001 max mem: 26157 Train: [94] [ 800/6250] eta: 0:24:47 lr: 0.000001 grad: 0.2180 (0.4487) loss: 0.7836 (0.7819) time: 0.2670 data: 0.0001 max mem: 26157 Train: [94] [ 900/6250] eta: 0:24:17 lr: 0.000001 grad: 0.2198 (0.4435) loss: 0.7858 (0.7824) time: 0.2681 data: 0.0002 max mem: 26157 Train: [94] [1000/6250] eta: 0:23:47 lr: 0.000001 grad: 0.2181 (0.4400) loss: 0.7858 (0.7824) time: 0.2681 data: 0.0001 max mem: 26157 Train: [94] [1100/6250] eta: 0:23:17 lr: 0.000001 grad: 0.2224 (0.4253) loss: 0.7836 (0.7825) time: 0.2669 data: 0.0001 max mem: 26157 Train: [94] [1200/6250] eta: 0:22:48 lr: 0.000001 grad: 0.2121 (0.4181) loss: 0.7827 (0.7827) time: 0.2667 data: 0.0001 max mem: 26157 Train: [94] [1300/6250] eta: 0:22:20 lr: 0.000001 grad: 0.2245 (0.4282) loss: 0.7850 (0.7827) time: 0.2664 data: 0.0001 max mem: 26157 Train: [94] [1400/6250] eta: 0:21:51 lr: 0.000001 grad: 0.2259 (0.4263) loss: 0.7837 (0.7829) time: 0.2669 data: 0.0001 max mem: 26157 Train: [94] [1500/6250] eta: 0:21:23 lr: 0.000001 grad: 0.2127 (0.4454) loss: 0.7892 (0.7831) time: 0.2680 data: 0.0001 max mem: 26157 Train: [94] [1600/6250] eta: 0:20:55 lr: 0.000001 grad: 0.2056 (0.4397) loss: 0.7819 (0.7833) time: 0.2667 data: 0.0002 max mem: 26157 Train: [94] [1700/6250] eta: 0:20:27 lr: 0.000001 grad: 0.2326 (0.4292) loss: 0.7844 (0.7834) time: 0.2677 data: 0.0001 max mem: 26157 Train: [94] [1800/6250] eta: 0:20:00 lr: 0.000001 grad: 0.2110 (0.4291) loss: 0.7808 (0.7834) time: 0.2666 data: 0.0001 max mem: 26157 Train: [94] [1900/6250] eta: 0:19:32 lr: 0.000001 grad: 0.2247 (0.4257) loss: 0.7795 (0.7835) time: 0.2670 data: 0.0001 max mem: 26157 Train: [94] [2000/6250] eta: 0:19:05 lr: 0.000001 grad: 0.2303 (0.4229) loss: 0.7854 (0.7836) time: 0.2677 data: 0.0001 max mem: 26157 Train: [94] [2100/6250] eta: 0:18:37 lr: 0.000001 grad: 0.2263 (0.4204) loss: 0.7895 (0.7836) time: 0.2672 data: 0.0001 max mem: 26157 Train: [94] [2200/6250] eta: 0:18:10 lr: 0.000001 grad: 0.2210 (0.4192) loss: 0.7845 (0.7837) time: 0.2673 data: 0.0001 max mem: 26157 Train: [94] [2300/6250] eta: 0:17:43 lr: 0.000001 grad: 0.2412 (0.4155) loss: 0.7811 (0.7837) time: 0.2665 data: 0.0001 max mem: 26157 Train: [94] [2400/6250] eta: 0:17:15 lr: 0.000001 grad: 0.2420 (0.4134) loss: 0.7787 (0.7837) time: 0.2674 data: 0.0002 max mem: 26157 Train: [94] [2500/6250] eta: 0:16:48 lr: 0.000001 grad: 0.2218 (0.4187) loss: 0.7848 (0.7836) time: 0.2680 data: 0.0002 max mem: 26157 Train: [94] [2600/6250] eta: 0:16:21 lr: 0.000001 grad: 0.2401 (0.4140) loss: 0.7802 (0.7834) time: 0.2669 data: 0.0002 max mem: 26157 Train: [94] [2700/6250] eta: 0:15:54 lr: 0.000001 grad: 0.2129 (0.4113) loss: 0.7862 (0.7833) time: 0.2665 data: 0.0001 max mem: 26157 Train: [94] [2800/6250] eta: 0:15:27 lr: 0.000001 grad: 0.2364 (0.4153) loss: 0.7791 (0.7833) time: 0.2672 data: 0.0001 max mem: 26157 Train: [94] [2900/6250] eta: 0:15:00 lr: 0.000001 grad: 0.2255 (0.4144) loss: 0.7849 (0.7833) time: 0.2667 data: 0.0001 max mem: 26157 Train: [94] [3000/6250] eta: 0:14:33 lr: 0.000001 grad: 0.2246 (0.4127) loss: 0.7789 (0.7832) time: 0.2667 data: 0.0001 max mem: 26157 Train: [94] [3100/6250] eta: 0:14:06 lr: 0.000001 grad: 0.2193 (0.4152) loss: 0.7757 (0.7831) time: 0.2669 data: 0.0001 max mem: 26157 Train: [94] [3200/6250] eta: 0:13:39 lr: 0.000001 grad: 0.2154 (0.4248) loss: 0.7801 (0.7830) time: 0.2667 data: 0.0001 max mem: 26157 Train: [94] [3300/6250] eta: 0:13:12 lr: 0.000001 grad: 0.2093 (0.4217) loss: 0.7869 (0.7829) time: 0.2667 data: 0.0001 max mem: 26157 Train: [94] [3400/6250] eta: 0:12:45 lr: 0.000001 grad: 0.2094 (0.4203) loss: 0.7873 (0.7828) time: 0.2665 data: 0.0001 max mem: 26157 Train: [94] [3500/6250] eta: 0:12:18 lr: 0.000001 grad: 0.2330 (0.4274) loss: 0.7766 (0.7827) time: 0.2671 data: 0.0001 max mem: 26157 Train: [94] [3600/6250] eta: 0:11:51 lr: 0.000001 grad: 0.2158 (0.4230) loss: 0.7766 (0.7825) time: 0.2674 data: 0.0001 max mem: 26157 Train: [94] [3700/6250] eta: 0:11:24 lr: 0.000001 grad: 0.2158 (0.4235) loss: 0.7792 (0.7824) time: 0.2669 data: 0.0001 max mem: 26157 Train: [94] [3800/6250] eta: 0:10:57 lr: 0.000001 grad: 0.2226 (0.4259) loss: 0.7721 (0.7822) time: 0.2666 data: 0.0001 max mem: 26157 Train: [94] [3900/6250] eta: 0:10:30 lr: 0.000001 grad: 0.2136 (0.4318) loss: 0.7847 (0.7821) time: 0.2682 data: 0.0001 max mem: 26157 Train: [94] [4000/6250] eta: 0:10:03 lr: 0.000001 grad: 0.2177 (0.4303) loss: 0.7837 (0.7820) time: 0.2666 data: 0.0001 max mem: 26157 Train: [94] [4100/6250] eta: 0:09:36 lr: 0.000001 grad: 0.2116 (0.4274) loss: 0.7829 (0.7820) time: 0.2665 data: 0.0001 max mem: 26157 Train: [94] [4200/6250] eta: 0:09:09 lr: 0.000001 grad: 0.2124 (0.4243) loss: 0.7776 (0.7819) time: 0.2668 data: 0.0001 max mem: 26157 Train: [94] [4300/6250] eta: 0:08:42 lr: 0.000001 grad: 0.2131 (0.4200) loss: 0.7787 (0.7819) time: 0.2662 data: 0.0001 max mem: 26157 Train: [94] [4400/6250] eta: 0:08:15 lr: 0.000001 grad: 0.2081 (0.4191) loss: 0.7844 (0.7819) time: 0.2666 data: 0.0001 max mem: 26157 Train: [94] [4500/6250] eta: 0:07:49 lr: 0.000001 grad: 0.2198 (0.4165) loss: 0.7880 (0.7819) time: 0.2670 data: 0.0001 max mem: 26157 Train: [94] [4600/6250] eta: 0:07:22 lr: 0.000001 grad: 0.2064 (0.4147) loss: 0.7855 (0.7820) time: 0.2666 data: 0.0002 max mem: 26157 Train: [94] [4700/6250] eta: 0:06:55 lr: 0.000001 grad: 0.2114 (0.4171) loss: 0.7922 (0.7821) time: 0.2671 data: 0.0001 max mem: 26157 Train: [94] [4800/6250] eta: 0:06:28 lr: 0.000001 grad: 0.2142 (0.4141) loss: 0.7886 (0.7822) time: 0.2675 data: 0.0001 max mem: 26157 Train: [94] [4900/6250] eta: 0:06:01 lr: 0.000001 grad: 0.2154 (0.4120) loss: 0.7817 (0.7823) time: 0.2663 data: 0.0001 max mem: 26157 Train: [94] [5000/6250] eta: 0:05:34 lr: 0.000001 grad: 0.2122 (0.4118) loss: 0.7953 (0.7824) time: 0.2665 data: 0.0001 max mem: 26157 Train: [94] [5100/6250] eta: 0:05:08 lr: 0.000001 grad: 0.2114 (0.4117) loss: 0.7852 (0.7824) time: 0.2663 data: 0.0001 max mem: 26157 Train: [94] [5200/6250] eta: 0:04:41 lr: 0.000001 grad: 0.2029 (0.4125) loss: 0.7883 (0.7825) time: 0.2670 data: 0.0001 max mem: 26157 Train: [94] [5300/6250] eta: 0:04:14 lr: 0.000001 grad: 0.2168 (0.4175) loss: 0.7867 (0.7825) time: 0.2667 data: 0.0001 max mem: 26157 Train: [94] [5400/6250] eta: 0:03:47 lr: 0.000001 grad: 0.2240 (0.4155) loss: 0.7782 (0.7826) time: 0.2669 data: 0.0001 max mem: 26157 Train: [94] [5500/6250] eta: 0:03:20 lr: 0.000001 grad: 0.2180 (0.4130) loss: 0.7830 (0.7826) time: 0.2665 data: 0.0002 max mem: 26157 Train: [94] [5600/6250] eta: 0:02:54 lr: 0.000001 grad: 0.2078 (0.4138) loss: 0.7848 (0.7827) time: 0.2662 data: 0.0001 max mem: 26157 Train: [94] [5700/6250] eta: 0:02:27 lr: 0.000001 grad: 0.2266 (0.4141) loss: 0.7840 (0.7827) time: 0.2655 data: 0.0001 max mem: 26157 Train: [94] [5800/6250] eta: 0:02:00 lr: 0.000001 grad: 0.2293 (0.4131) loss: 0.7795 (0.7827) time: 0.2662 data: 0.0001 max mem: 26157 Train: [94] [5900/6250] eta: 0:01:33 lr: 0.000001 grad: 0.2135 (0.4182) loss: 0.7784 (0.7827) time: 0.2677 data: 0.0002 max mem: 26157 Train: [94] [6000/6250] eta: 0:01:06 lr: 0.000001 grad: 0.2159 (0.4158) loss: 0.7795 (0.7827) time: 0.2666 data: 0.0001 max mem: 26157 Train: [94] [6100/6250] eta: 0:00:40 lr: 0.000001 grad: 0.2192 (0.4186) loss: 0.7871 (0.7827) time: 0.2666 data: 0.0001 max mem: 26157 Train: [94] [6200/6250] eta: 0:00:13 lr: 0.000001 grad: 0.2242 (0.4159) loss: 0.7837 (0.7828) time: 0.2671 data: 0.0001 max mem: 26157 Train: [94] [6249/6250] eta: 0:00:00 lr: 0.000001 grad: 0.2154 (0.4155) loss: 0.7906 (0.7828) time: 0.2668 data: 0.0001 max mem: 26157 Train: [94] Total time: 0:27:56 (0.2683 s / it) Averaged stats: lr: 0.000001 grad: 0.2154 (0.4155) loss: 0.7906 (0.7828) Eval (hcp-train-subset): [94] [ 0/62] eta: 0:03:59 loss: 0.7858 (0.7858) time: 3.8585 data: 3.7755 max mem: 26157 Eval (hcp-train-subset): [94] [61/62] eta: 0:00:00 loss: 0.7808 (0.7849) time: 0.0824 data: 0.0001 max mem: 26157 Eval (hcp-train-subset): [94] Total time: 0:00:10 (0.1660 s / it) Averaged stats (hcp-train-subset): loss: 0.7808 (0.7849) Making plots (hcp-train-subset): example=39 Eval (hcp-val): [94] [ 0/62] eta: 0:04:15 loss: 0.8202 (0.8202) time: 4.1283 data: 4.0455 max mem: 26157 Eval (hcp-val): [94] [61/62] eta: 0:00:00 loss: 0.8213 (0.8221) time: 0.0883 data: 0.0059 max mem: 26157 Eval (hcp-val): [94] Total time: 0:00:10 (0.1684 s / it) Averaged stats (hcp-val): loss: 0.8213 (0.8221) Making plots (hcp-val): example=15 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [95] [ 0/6250] eta: 9:14:53 lr: 0.000001 grad: 0.1914 (0.1914) loss: 0.8216 (0.8216) time: 5.3270 data: 5.0537 max mem: 26157 Train: [95] [ 100/6250] eta: 0:32:33 lr: 0.000001 grad: 0.2586 (0.3818) loss: 0.7904 (0.7936) time: 0.2677 data: 0.0001 max mem: 26157 Train: [95] [ 200/6250] eta: 0:29:27 lr: 0.000001 grad: 0.2687 (0.4052) loss: 0.7936 (0.7895) time: 0.2661 data: 0.0001 max mem: 26157 Train: [95] [ 300/6250] eta: 0:28:08 lr: 0.000001 grad: 0.2839 (0.3858) loss: 0.7835 (0.7856) time: 0.2680 data: 0.0001 max mem: 26157 Train: [95] [ 400/6250] eta: 0:27:16 lr: 0.000001 grad: 0.2458 (0.3742) loss: 0.7851 (0.7836) time: 0.2669 data: 0.0002 max mem: 26157 Train: [95] [ 500/6250] eta: 0:26:34 lr: 0.000001 grad: 0.2377 (0.4589) loss: 0.7844 (0.7828) time: 0.2664 data: 0.0001 max mem: 26157 Train: [95] [ 600/6250] eta: 0:25:56 lr: 0.000001 grad: 0.2304 (0.4360) loss: 0.7794 (0.7824) time: 0.2669 data: 0.0001 max mem: 26157 Train: [95] [ 700/6250] eta: 0:25:23 lr: 0.000001 grad: 0.2277 (0.4440) loss: 0.7825 (0.7821) time: 0.2666 data: 0.0001 max mem: 26157 Train: [95] [ 800/6250] eta: 0:24:50 lr: 0.000001 grad: 0.2374 (0.4319) loss: 0.7785 (0.7822) time: 0.2668 data: 0.0001 max mem: 26157 Train: [95] [ 900/6250] eta: 0:24:19 lr: 0.000001 grad: 0.2184 (0.4269) loss: 0.7820 (0.7820) time: 0.2674 data: 0.0002 max mem: 26157 Train: [95] [1000/6250] eta: 0:23:49 lr: 0.000001 grad: 0.2067 (0.4308) loss: 0.7789 (0.7821) time: 0.2671 data: 0.0001 max mem: 26157 Train: [95] [1100/6250] eta: 0:23:19 lr: 0.000001 grad: 0.2272 (0.4212) loss: 0.7895 (0.7825) time: 0.2677 data: 0.0002 max mem: 26157 Train: [95] [1200/6250] eta: 0:22:50 lr: 0.000001 grad: 0.2483 (0.4179) loss: 0.7812 (0.7825) time: 0.2669 data: 0.0001 max mem: 26157 Train: [95] [1300/6250] eta: 0:22:22 lr: 0.000001 grad: 0.2098 (0.4153) loss: 0.7758 (0.7827) time: 0.2669 data: 0.0002 max mem: 26157 Train: [95] [1400/6250] eta: 0:21:53 lr: 0.000001 grad: 0.2248 (0.4065) loss: 0.7774 (0.7826) time: 0.2666 data: 0.0002 max mem: 26157 Train: [95] [1500/6250] eta: 0:21:25 lr: 0.000001 grad: 0.2090 (0.4033) loss: 0.7779 (0.7824) time: 0.2674 data: 0.0001 max mem: 26157 Train: [95] [1600/6250] eta: 0:20:56 lr: 0.000001 grad: 0.2247 (0.4034) loss: 0.7796 (0.7823) time: 0.2667 data: 0.0001 max mem: 26157 Train: [95] [1700/6250] eta: 0:20:29 lr: 0.000001 grad: 0.2214 (0.3981) loss: 0.7770 (0.7820) time: 0.2673 data: 0.0002 max mem: 26157 Train: [95] [1800/6250] eta: 0:20:01 lr: 0.000001 grad: 0.2220 (0.3912) loss: 0.7820 (0.7818) time: 0.2668 data: 0.0001 max mem: 26157 Train: [95] [1900/6250] eta: 0:19:33 lr: 0.000001 grad: 0.2208 (0.3928) loss: 0.7798 (0.7818) time: 0.2660 data: 0.0002 max mem: 26157 Train: [95] [2000/6250] eta: 0:19:05 lr: 0.000001 grad: 0.2215 (0.3970) loss: 0.7786 (0.7816) time: 0.2671 data: 0.0001 max mem: 26157 Train: [95] [2100/6250] eta: 0:18:38 lr: 0.000001 grad: 0.2121 (0.3947) loss: 0.7847 (0.7815) time: 0.2674 data: 0.0001 max mem: 26157 Train: [95] [2200/6250] eta: 0:18:10 lr: 0.000001 grad: 0.2160 (0.4010) loss: 0.7779 (0.7814) time: 0.2666 data: 0.0001 max mem: 26157 Train: [95] [2300/6250] eta: 0:17:43 lr: 0.000001 grad: 0.2300 (0.3990) loss: 0.7833 (0.7812) time: 0.2684 data: 0.0002 max mem: 26157 Train: [95] [2400/6250] eta: 0:17:16 lr: 0.000001 grad: 0.2093 (0.3985) loss: 0.7806 (0.7812) time: 0.2675 data: 0.0001 max mem: 26157 Train: [95] [2500/6250] eta: 0:16:49 lr: 0.000001 grad: 0.2228 (0.3978) loss: 0.7859 (0.7813) time: 0.2673 data: 0.0001 max mem: 26157 Train: [95] [2600/6250] eta: 0:16:21 lr: 0.000001 grad: 0.2384 (0.4014) loss: 0.7746 (0.7814) time: 0.2671 data: 0.0001 max mem: 26157 Train: [95] [2700/6250] eta: 0:15:55 lr: 0.000001 grad: 0.2192 (0.3979) loss: 0.7819 (0.7814) time: 0.2681 data: 0.0001 max mem: 26157 Train: [95] [2800/6250] eta: 0:15:28 lr: 0.000001 grad: 0.2108 (0.3967) loss: 0.7859 (0.7815) time: 0.2668 data: 0.0001 max mem: 26157 Train: [95] [2900/6250] eta: 0:15:01 lr: 0.000001 grad: 0.2106 (0.3963) loss: 0.7933 (0.7816) time: 0.2669 data: 0.0001 max mem: 26157 Train: [95] [3000/6250] eta: 0:14:34 lr: 0.000001 grad: 0.2115 (0.4006) loss: 0.7716 (0.7816) time: 0.2669 data: 0.0001 max mem: 26157 Train: [95] [3100/6250] eta: 0:14:07 lr: 0.000001 grad: 0.2007 (0.4061) loss: 0.7842 (0.7816) time: 0.2679 data: 0.0002 max mem: 26157 Train: [95] [3200/6250] eta: 0:13:40 lr: 0.000001 grad: 0.2109 (0.4106) loss: 0.7877 (0.7817) time: 0.2676 data: 0.0001 max mem: 26157 Train: [95] [3300/6250] eta: 0:13:13 lr: 0.000001 grad: 0.2310 (0.4062) loss: 0.7847 (0.7817) time: 0.2673 data: 0.0001 max mem: 26157 Train: [95] [3400/6250] eta: 0:12:46 lr: 0.000001 grad: 0.2292 (0.4061) loss: 0.7726 (0.7817) time: 0.2666 data: 0.0001 max mem: 26157 Train: [95] [3500/6250] eta: 0:12:19 lr: 0.000001 grad: 0.2200 (0.4042) loss: 0.7816 (0.7817) time: 0.2663 data: 0.0001 max mem: 26157 Train: [95] [3600/6250] eta: 0:11:52 lr: 0.000001 grad: 0.2294 (0.4029) loss: 0.7677 (0.7817) time: 0.2673 data: 0.0001 max mem: 26157 Train: [95] [3700/6250] eta: 0:11:25 lr: 0.000001 grad: 0.2193 (0.4011) loss: 0.7920 (0.7818) time: 0.2668 data: 0.0001 max mem: 26157 Train: [95] [3800/6250] eta: 0:10:58 lr: 0.000001 grad: 0.2178 (0.4009) loss: 0.7842 (0.7818) time: 0.2670 data: 0.0001 max mem: 26157 Train: [95] [3900/6250] eta: 0:10:31 lr: 0.000001 grad: 0.2372 (0.3977) loss: 0.7772 (0.7818) time: 0.2666 data: 0.0001 max mem: 26157 Train: [95] [4000/6250] eta: 0:10:04 lr: 0.000001 grad: 0.2357 (0.4010) loss: 0.7757 (0.7818) time: 0.2671 data: 0.0001 max mem: 26157 Train: [95] [4100/6250] eta: 0:09:37 lr: 0.000001 grad: 0.2427 (0.4037) loss: 0.7774 (0.7819) time: 0.2666 data: 0.0001 max mem: 26157 Train: [95] [4200/6250] eta: 0:09:10 lr: 0.000001 grad: 0.2322 (0.4039) loss: 0.7737 (0.7819) time: 0.2677 data: 0.0001 max mem: 26157 Train: [95] [4300/6250] eta: 0:08:43 lr: 0.000001 grad: 0.2125 (0.4068) loss: 0.7832 (0.7819) time: 0.2674 data: 0.0002 max mem: 26157 Train: [95] [4400/6250] eta: 0:08:16 lr: 0.000001 grad: 0.2047 (0.4069) loss: 0.7880 (0.7820) time: 0.2674 data: 0.0001 max mem: 26157 Train: [95] [4500/6250] eta: 0:07:49 lr: 0.000001 grad: 0.2197 (0.4049) loss: 0.7868 (0.7821) time: 0.2669 data: 0.0001 max mem: 26157 Train: [95] [4600/6250] eta: 0:07:22 lr: 0.000001 grad: 0.2145 (0.4036) loss: 0.7873 (0.7822) time: 0.2669 data: 0.0001 max mem: 26157 Train: [95] [4700/6250] eta: 0:06:55 lr: 0.000001 grad: 0.2102 (0.4051) loss: 0.7864 (0.7823) time: 0.2668 data: 0.0001 max mem: 26157 Train: [95] [4800/6250] eta: 0:06:29 lr: 0.000001 grad: 0.1985 (0.4025) loss: 0.7908 (0.7824) time: 0.2675 data: 0.0001 max mem: 26157 Train: [95] [4900/6250] eta: 0:06:02 lr: 0.000001 grad: 0.2058 (0.4046) loss: 0.7898 (0.7825) time: 0.2671 data: 0.0001 max mem: 26157 Train: [95] [5000/6250] eta: 0:05:35 lr: 0.000001 grad: 0.2150 (0.4084) loss: 0.7825 (0.7826) time: 0.2692 data: 0.0001 max mem: 26157 Train: [95] [5100/6250] eta: 0:05:08 lr: 0.000001 grad: 0.2333 (0.4064) loss: 0.7897 (0.7827) time: 0.2668 data: 0.0001 max mem: 26157 Train: [95] [5200/6250] eta: 0:04:41 lr: 0.000001 grad: 0.2278 (0.4057) loss: 0.7907 (0.7828) time: 0.2667 data: 0.0001 max mem: 26157 Train: [95] [5300/6250] eta: 0:04:14 lr: 0.000001 grad: 0.1950 (0.4045) loss: 0.7949 (0.7828) time: 0.2665 data: 0.0001 max mem: 26157 Train: [95] [5400/6250] eta: 0:03:47 lr: 0.000001 grad: 0.2172 (0.4024) loss: 0.7923 (0.7829) time: 0.2677 data: 0.0001 max mem: 26157 Train: [95] [5500/6250] eta: 0:03:21 lr: 0.000001 grad: 0.2081 (0.4041) loss: 0.7885 (0.7831) time: 0.2656 data: 0.0001 max mem: 26157 Train: [95] [5600/6250] eta: 0:02:54 lr: 0.000001 grad: 0.2132 (0.4024) loss: 0.7920 (0.7832) time: 0.2667 data: 0.0001 max mem: 26157 Train: [95] [5700/6250] eta: 0:02:27 lr: 0.000001 grad: 0.2176 (0.4015) loss: 0.7892 (0.7832) time: 0.2677 data: 0.0001 max mem: 26157 Train: [95] [5800/6250] eta: 0:02:00 lr: 0.000001 grad: 0.2150 (0.4004) loss: 0.7876 (0.7833) time: 0.2674 data: 0.0001 max mem: 26157 Train: [95] [5900/6250] eta: 0:01:33 lr: 0.000001 grad: 0.2243 (0.3993) loss: 0.7815 (0.7833) time: 0.2659 data: 0.0001 max mem: 26157 Train: [95] [6000/6250] eta: 0:01:07 lr: 0.000001 grad: 0.2380 (0.4046) loss: 0.7846 (0.7833) time: 0.2667 data: 0.0001 max mem: 26157 Train: [95] [6100/6250] eta: 0:00:40 lr: 0.000001 grad: 0.2123 (0.4103) loss: 0.7851 (0.7834) time: 0.2663 data: 0.0001 max mem: 26157 Train: [95] [6200/6250] eta: 0:00:13 lr: 0.000001 grad: 0.2339 (0.4129) loss: 0.7811 (0.7833) time: 0.2664 data: 0.0001 max mem: 26157 Train: [95] [6249/6250] eta: 0:00:00 lr: 0.000001 grad: 0.2143 (0.4138) loss: 0.7844 (0.7833) time: 0.2669 data: 0.0002 max mem: 26157 Train: [95] Total time: 0:27:58 (0.2685 s / it) Averaged stats: lr: 0.000001 grad: 0.2143 (0.4138) loss: 0.7844 (0.7833) Eval (hcp-train-subset): [95] [ 0/62] eta: 0:05:05 loss: 0.7847 (0.7847) time: 4.9198 data: 4.8369 max mem: 26157 Eval (hcp-train-subset): [95] [61/62] eta: 0:00:00 loss: 0.7800 (0.7849) time: 0.1027 data: 0.0202 max mem: 26157 Eval (hcp-train-subset): [95] Total time: 0:00:11 (0.1883 s / it) Averaged stats (hcp-train-subset): loss: 0.7800 (0.7849) Making plots (hcp-train-subset): example=16 Eval (hcp-val): [95] [ 0/62] eta: 0:03:50 loss: 0.8171 (0.8171) time: 3.7161 data: 3.6109 max mem: 26157 Eval (hcp-val): [95] [61/62] eta: 0:00:00 loss: 0.8218 (0.8218) time: 0.1082 data: 0.0257 max mem: 26157 Eval (hcp-val): [95] Total time: 0:00:11 (0.1854 s / it) Averaged stats (hcp-val): loss: 0.8218 (0.8218) Making plots (hcp-val): example=40 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [96] [ 0/6250] eta: 8:24:52 lr: 0.000001 grad: 0.3420 (0.3420) loss: 0.7692 (0.7692) time: 4.8467 data: 4.5758 max mem: 26157 Train: [96] [ 100/6250] eta: 0:32:02 lr: 0.000001 grad: 0.2700 (0.6761) loss: 0.7954 (0.7945) time: 0.2665 data: 0.0002 max mem: 26157 Train: [96] [ 200/6250] eta: 0:29:12 lr: 0.000001 grad: 0.2459 (0.4901) loss: 0.7910 (0.7959) time: 0.2662 data: 0.0001 max mem: 26157 Train: [96] [ 300/6250] eta: 0:27:58 lr: 0.000001 grad: 0.2594 (0.4154) loss: 0.8029 (0.7970) time: 0.2674 data: 0.0001 max mem: 26157 Train: [96] [ 400/6250] eta: 0:27:09 lr: 0.000001 grad: 0.2429 (0.3975) loss: 0.7974 (0.7971) time: 0.2675 data: 0.0002 max mem: 26157 Train: [96] [ 500/6250] eta: 0:26:28 lr: 0.000001 grad: 0.2304 (0.3853) loss: 0.7925 (0.7964) time: 0.2668 data: 0.0001 max mem: 26157 Train: [96] [ 600/6250] eta: 0:25:52 lr: 0.000001 grad: 0.2320 (0.3827) loss: 0.7797 (0.7944) time: 0.2662 data: 0.0001 max mem: 26157 Train: [96] [ 700/6250] eta: 0:25:18 lr: 0.000001 grad: 0.2234 (0.3700) loss: 0.7873 (0.7931) time: 0.2670 data: 0.0001 max mem: 26157 Train: [96] [ 800/6250] eta: 0:24:47 lr: 0.000001 grad: 0.2302 (0.3754) loss: 0.7791 (0.7920) time: 0.2672 data: 0.0001 max mem: 26157 Train: [96] [ 900/6250] eta: 0:24:16 lr: 0.000001 grad: 0.2381 (0.3979) loss: 0.7796 (0.7909) time: 0.2673 data: 0.0001 max mem: 26157 Train: [96] [1000/6250] eta: 0:23:46 lr: 0.000001 grad: 0.2230 (0.3882) loss: 0.7874 (0.7902) time: 0.2669 data: 0.0001 max mem: 26157 Train: [96] [1100/6250] eta: 0:23:17 lr: 0.000000 grad: 0.2290 (0.4313) loss: 0.7852 (0.7896) time: 0.2682 data: 0.0001 max mem: 26157 Train: [96] [1200/6250] eta: 0:22:48 lr: 0.000000 grad: 0.2231 (0.4237) loss: 0.7792 (0.7891) time: 0.2665 data: 0.0001 max mem: 26157 Train: [96] [1300/6250] eta: 0:22:19 lr: 0.000000 grad: 0.2409 (0.4201) loss: 0.7810 (0.7886) time: 0.2672 data: 0.0001 max mem: 26157 Train: [96] [1400/6250] eta: 0:21:51 lr: 0.000000 grad: 0.2366 (0.4238) loss: 0.7854 (0.7883) time: 0.2680 data: 0.0002 max mem: 26157 Train: [96] [1500/6250] eta: 0:21:23 lr: 0.000000 grad: 0.2287 (0.4191) loss: 0.7727 (0.7878) time: 0.2676 data: 0.0002 max mem: 26157 Train: [96] [1600/6250] eta: 0:20:55 lr: 0.000000 grad: 0.2423 (0.4148) loss: 0.7842 (0.7875) time: 0.2663 data: 0.0002 max mem: 26157 Train: [96] [1700/6250] eta: 0:20:27 lr: 0.000000 grad: 0.2420 (0.4110) loss: 0.7784 (0.7870) time: 0.2667 data: 0.0001 max mem: 26157 Train: [96] [1800/6250] eta: 0:19:59 lr: 0.000000 grad: 0.2384 (0.4107) loss: 0.7815 (0.7866) time: 0.2679 data: 0.0001 max mem: 26157 Train: [96] [1900/6250] eta: 0:19:32 lr: 0.000000 grad: 0.2270 (0.4154) loss: 0.7744 (0.7861) time: 0.2670 data: 0.0001 max mem: 26157 Train: [96] [2000/6250] eta: 0:19:04 lr: 0.000000 grad: 0.2147 (0.4074) loss: 0.7779 (0.7858) time: 0.2683 data: 0.0001 max mem: 26157 Train: [96] [2100/6250] eta: 0:18:37 lr: 0.000000 grad: 0.2276 (0.4206) loss: 0.7805 (0.7854) time: 0.2667 data: 0.0001 max mem: 26157 Train: [96] [2200/6250] eta: 0:18:10 lr: 0.000000 grad: 0.2080 (0.4150) loss: 0.7852 (0.7852) time: 0.2676 data: 0.0001 max mem: 26157 Train: [96] [2300/6250] eta: 0:17:42 lr: 0.000000 grad: 0.2311 (0.4116) loss: 0.7704 (0.7848) time: 0.2669 data: 0.0001 max mem: 26157 Train: [96] [2400/6250] eta: 0:17:15 lr: 0.000000 grad: 0.2313 (0.4144) loss: 0.7785 (0.7847) time: 0.2666 data: 0.0001 max mem: 26157 Train: [96] [2500/6250] eta: 0:16:48 lr: 0.000000 grad: 0.2221 (0.4172) loss: 0.7820 (0.7846) time: 0.2676 data: 0.0002 max mem: 26157 Train: [96] [2600/6250] eta: 0:16:21 lr: 0.000000 grad: 0.2233 (0.4148) loss: 0.7851 (0.7846) time: 0.2670 data: 0.0001 max mem: 26157 Train: [96] [2700/6250] eta: 0:15:53 lr: 0.000000 grad: 0.2351 (0.4119) loss: 0.7767 (0.7845) time: 0.2664 data: 0.0001 max mem: 26157 Train: [96] [2800/6250] eta: 0:15:26 lr: 0.000000 grad: 0.2364 (0.4107) loss: 0.7780 (0.7844) time: 0.2666 data: 0.0001 max mem: 26157 Train: [96] [2900/6250] eta: 0:14:59 lr: 0.000000 grad: 0.2207 (0.4090) loss: 0.7797 (0.7842) time: 0.2684 data: 0.0002 max mem: 26157 Train: [96] [3000/6250] eta: 0:14:32 lr: 0.000000 grad: 0.2234 (0.4052) loss: 0.7789 (0.7840) time: 0.2666 data: 0.0001 max mem: 26157 Train: [96] [3100/6250] eta: 0:14:05 lr: 0.000000 grad: 0.2267 (0.4068) loss: 0.7723 (0.7840) time: 0.2661 data: 0.0001 max mem: 26157 Train: [96] [3200/6250] eta: 0:13:38 lr: 0.000000 grad: 0.2155 (0.4126) loss: 0.7783 (0.7838) time: 0.2666 data: 0.0001 max mem: 26157 Train: [96] [3300/6250] eta: 0:13:11 lr: 0.000000 grad: 0.2101 (0.4093) loss: 0.7766 (0.7837) time: 0.2663 data: 0.0001 max mem: 26157 Train: [96] [3400/6250] eta: 0:12:44 lr: 0.000000 grad: 0.2156 (0.4062) loss: 0.7830 (0.7836) time: 0.2677 data: 0.0001 max mem: 26157 Train: [96] [3500/6250] eta: 0:12:17 lr: 0.000000 grad: 0.2219 (0.4132) loss: 0.7829 (0.7835) time: 0.2669 data: 0.0001 max mem: 26157 Train: [96] [3600/6250] eta: 0:11:50 lr: 0.000000 grad: 0.2226 (0.4105) loss: 0.7809 (0.7834) time: 0.2676 data: 0.0001 max mem: 26157 Train: [96] [3700/6250] eta: 0:11:24 lr: 0.000000 grad: 0.2210 (0.4123) loss: 0.7783 (0.7833) time: 0.2672 data: 0.0001 max mem: 26157 Train: [96] [3800/6250] eta: 0:10:57 lr: 0.000000 grad: 0.2054 (0.4096) loss: 0.7809 (0.7832) time: 0.2684 data: 0.0002 max mem: 26157 Train: [96] [3900/6250] eta: 0:10:30 lr: 0.000000 grad: 0.2191 (0.4054) loss: 0.7673 (0.7831) time: 0.2674 data: 0.0001 max mem: 26157 Train: [96] [4000/6250] eta: 0:10:03 lr: 0.000000 grad: 0.2309 (0.4046) loss: 0.7816 (0.7830) time: 0.2665 data: 0.0001 max mem: 26157 Train: [96] [4100/6250] eta: 0:09:36 lr: 0.000000 grad: 0.2341 (0.4008) loss: 0.7808 (0.7829) time: 0.2664 data: 0.0001 max mem: 26157 Train: [96] [4200/6250] eta: 0:09:09 lr: 0.000000 grad: 0.2170 (0.3988) loss: 0.7835 (0.7828) time: 0.2659 data: 0.0001 max mem: 26157 Train: [96] [4300/6250] eta: 0:08:42 lr: 0.000000 grad: 0.2273 (0.3970) loss: 0.7737 (0.7827) time: 0.2693 data: 0.0002 max mem: 26157 Train: [96] [4400/6250] eta: 0:08:15 lr: 0.000000 grad: 0.2269 (0.4009) loss: 0.7812 (0.7826) time: 0.2664 data: 0.0001 max mem: 26157 Train: [96] [4500/6250] eta: 0:07:49 lr: 0.000000 grad: 0.2445 (0.4026) loss: 0.7739 (0.7825) time: 0.2668 data: 0.0001 max mem: 26157 Train: [96] [4600/6250] eta: 0:07:22 lr: 0.000000 grad: 0.2410 (0.4039) loss: 0.7807 (0.7824) time: 0.2669 data: 0.0001 max mem: 26157 Train: [96] [4700/6250] eta: 0:06:55 lr: 0.000000 grad: 0.2584 (0.4026) loss: 0.7811 (0.7824) time: 0.2671 data: 0.0001 max mem: 26157 Train: [96] [4800/6250] eta: 0:06:28 lr: 0.000000 grad: 0.2243 (0.4007) loss: 0.7745 (0.7823) time: 0.2668 data: 0.0001 max mem: 26157 Train: [96] [4900/6250] eta: 0:06:01 lr: 0.000000 grad: 0.2505 (0.4044) loss: 0.7730 (0.7822) time: 0.2669 data: 0.0001 max mem: 26157 Train: [96] [5000/6250] eta: 0:05:34 lr: 0.000000 grad: 0.2566 (0.4062) loss: 0.7830 (0.7822) time: 0.2668 data: 0.0001 max mem: 26157 Train: [96] [5100/6250] eta: 0:05:08 lr: 0.000000 grad: 0.2395 (0.4051) loss: 0.7789 (0.7821) time: 0.2684 data: 0.0002 max mem: 26157 Train: [96] [5200/6250] eta: 0:04:41 lr: 0.000000 grad: 0.2330 (0.4032) loss: 0.7898 (0.7821) time: 0.2666 data: 0.0002 max mem: 26157 Train: [96] [5300/6250] eta: 0:04:14 lr: 0.000000 grad: 0.2346 (0.4048) loss: 0.7834 (0.7821) time: 0.2684 data: 0.0001 max mem: 26157 Train: [96] [5400/6250] eta: 0:03:47 lr: 0.000000 grad: 0.2304 (0.4048) loss: 0.7753 (0.7820) time: 0.2666 data: 0.0001 max mem: 26157 Train: [96] [5500/6250] eta: 0:03:20 lr: 0.000000 grad: 0.2307 (0.4046) loss: 0.7753 (0.7820) time: 0.2658 data: 0.0001 max mem: 26157 Train: [96] [5600/6250] eta: 0:02:54 lr: 0.000000 grad: 0.2271 (0.4042) loss: 0.7826 (0.7820) time: 0.2663 data: 0.0001 max mem: 26157 Train: [96] [5700/6250] eta: 0:02:27 lr: 0.000000 grad: 0.2240 (0.4030) loss: 0.7798 (0.7819) time: 0.2679 data: 0.0002 max mem: 26157 Train: [96] [5800/6250] eta: 0:02:00 lr: 0.000000 grad: 0.2219 (0.4043) loss: 0.7830 (0.7819) time: 0.2669 data: 0.0001 max mem: 26157 Train: [96] [5900/6250] eta: 0:01:33 lr: 0.000000 grad: 0.2319 (0.4052) loss: 0.7797 (0.7819) time: 0.2673 data: 0.0001 max mem: 26157 Train: [96] [6000/6250] eta: 0:01:06 lr: 0.000000 grad: 0.2209 (0.4056) loss: 0.7800 (0.7819) time: 0.2673 data: 0.0001 max mem: 26157 Train: [96] [6100/6250] eta: 0:00:40 lr: 0.000000 grad: 0.2474 (0.4056) loss: 0.7865 (0.7819) time: 0.2671 data: 0.0001 max mem: 26157 Train: [96] [6200/6250] eta: 0:00:13 lr: 0.000000 grad: 0.2462 (0.4105) loss: 0.7828 (0.7819) time: 0.2674 data: 0.0001 max mem: 26157 Train: [96] [6249/6250] eta: 0:00:00 lr: 0.000000 grad: 0.2159 (0.4125) loss: 0.7834 (0.7819) time: 0.2665 data: 0.0001 max mem: 26157 Train: [96] Total time: 0:27:57 (0.2683 s / it) Averaged stats: lr: 0.000000 grad: 0.2159 (0.4125) loss: 0.7834 (0.7819) Eval (hcp-train-subset): [96] [ 0/62] eta: 0:04:02 loss: 0.7845 (0.7845) time: 3.9101 data: 3.8275 max mem: 26157 Eval (hcp-train-subset): [96] [61/62] eta: 0:00:00 loss: 0.7801 (0.7842) time: 0.0822 data: 0.0001 max mem: 26157 Eval (hcp-train-subset): [96] Total time: 0:00:10 (0.1649 s / it) Averaged stats (hcp-train-subset): loss: 0.7801 (0.7842) Making plots (hcp-train-subset): example=40 Eval (hcp-val): [96] [ 0/62] eta: 0:04:36 loss: 0.8175 (0.8175) time: 4.4570 data: 4.3737 max mem: 26157 Eval (hcp-val): [96] [61/62] eta: 0:00:00 loss: 0.8231 (0.8224) time: 0.0841 data: 0.0001 max mem: 26157 Eval (hcp-val): [96] Total time: 0:00:10 (0.1706 s / it) Averaged stats (hcp-val): loss: 0.8231 (0.8224) Making plots (hcp-val): example=55 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [97] [ 0/6250] eta: 6:25:49 lr: 0.000000 grad: 0.2990 (0.2990) loss: 0.8480 (0.8480) time: 3.7039 data: 3.3666 max mem: 26157 Train: [97] [ 100/6250] eta: 0:32:28 lr: 0.000000 grad: 0.2720 (0.4532) loss: 0.7932 (0.7922) time: 0.2677 data: 0.0001 max mem: 26157 Train: [97] [ 200/6250] eta: 0:29:25 lr: 0.000000 grad: 0.2916 (0.4127) loss: 0.7965 (0.7910) time: 0.2666 data: 0.0002 max mem: 26157 Train: [97] [ 300/6250] eta: 0:28:08 lr: 0.000000 grad: 0.2338 (0.3919) loss: 0.8031 (0.7925) time: 0.2683 data: 0.0001 max mem: 26157 Train: [97] [ 400/6250] eta: 0:27:16 lr: 0.000000 grad: 0.2335 (0.3670) loss: 0.7918 (0.7942) time: 0.2665 data: 0.0001 max mem: 26157 Train: [97] [ 500/6250] eta: 0:26:34 lr: 0.000000 grad: 0.2291 (0.3701) loss: 0.7883 (0.7937) time: 0.2677 data: 0.0002 max mem: 26157 Train: [97] [ 600/6250] eta: 0:25:57 lr: 0.000000 grad: 0.2308 (0.3727) loss: 0.7911 (0.7931) time: 0.2680 data: 0.0001 max mem: 26157 Train: [97] [ 700/6250] eta: 0:25:23 lr: 0.000000 grad: 0.2290 (0.3644) loss: 0.7824 (0.7923) time: 0.2684 data: 0.0001 max mem: 26157 Train: [97] [ 800/6250] eta: 0:24:50 lr: 0.000000 grad: 0.2218 (0.3517) loss: 0.7830 (0.7918) time: 0.2672 data: 0.0002 max mem: 26157 Train: [97] [ 900/6250] eta: 0:24:19 lr: 0.000000 grad: 0.2256 (0.3438) loss: 0.7916 (0.7913) time: 0.2670 data: 0.0001 max mem: 26157 Train: [97] [1000/6250] eta: 0:23:49 lr: 0.000000 grad: 0.2336 (0.3419) loss: 0.7768 (0.7910) time: 0.2674 data: 0.0001 max mem: 26157 Train: [97] [1100/6250] eta: 0:23:19 lr: 0.000000 grad: 0.2216 (0.3499) loss: 0.7855 (0.7906) time: 0.2670 data: 0.0001 max mem: 26157 Train: [97] [1200/6250] eta: 0:22:50 lr: 0.000000 grad: 0.2189 (0.3583) loss: 0.7826 (0.7903) time: 0.2664 data: 0.0001 max mem: 26157 Train: [97] [1300/6250] eta: 0:22:22 lr: 0.000000 grad: 0.2131 (0.3533) loss: 0.7919 (0.7902) time: 0.2665 data: 0.0001 max mem: 26157 Train: [97] [1400/6250] eta: 0:21:53 lr: 0.000000 grad: 0.2068 (0.3490) loss: 0.7879 (0.7900) time: 0.2675 data: 0.0001 max mem: 26157 Train: [97] [1500/6250] eta: 0:21:25 lr: 0.000000 grad: 0.2230 (0.3507) loss: 0.7792 (0.7894) time: 0.2675 data: 0.0001 max mem: 26157 Train: [97] [1600/6250] eta: 0:20:57 lr: 0.000000 grad: 0.2406 (0.3499) loss: 0.7890 (0.7892) time: 0.2671 data: 0.0001 max mem: 26157 Train: [97] [1700/6250] eta: 0:20:29 lr: 0.000000 grad: 0.2213 (0.3469) loss: 0.7785 (0.7889) time: 0.2668 data: 0.0002 max mem: 26157 Train: [97] [1800/6250] eta: 0:20:01 lr: 0.000000 grad: 0.2229 (0.3487) loss: 0.7856 (0.7887) time: 0.2671 data: 0.0001 max mem: 26157 Train: [97] [1900/6250] eta: 0:19:33 lr: 0.000000 grad: 0.2373 (0.3520) loss: 0.7842 (0.7887) time: 0.2671 data: 0.0001 max mem: 26157 Train: [97] [2000/6250] eta: 0:19:06 lr: 0.000000 grad: 0.2281 (0.3616) loss: 0.7853 (0.7885) time: 0.2664 data: 0.0001 max mem: 26157 Train: [97] [2100/6250] eta: 0:18:38 lr: 0.000000 grad: 0.2173 (0.3608) loss: 0.7814 (0.7885) time: 0.2660 data: 0.0001 max mem: 26157 Train: [97] [2200/6250] eta: 0:18:11 lr: 0.000000 grad: 0.2168 (0.3667) loss: 0.7966 (0.7886) time: 0.2677 data: 0.0001 max mem: 26157 Train: [97] [2300/6250] eta: 0:17:43 lr: 0.000000 grad: 0.2324 (0.3643) loss: 0.7896 (0.7887) time: 0.2668 data: 0.0001 max mem: 26157 Train: [97] [2400/6250] eta: 0:17:16 lr: 0.000000 grad: 0.2282 (0.3644) loss: 0.7862 (0.7886) time: 0.2669 data: 0.0001 max mem: 26157 Train: [97] [2500/6250] eta: 0:16:49 lr: 0.000000 grad: 0.2230 (0.3668) loss: 0.7859 (0.7886) time: 0.2668 data: 0.0002 max mem: 26157 Train: [97] [2600/6250] eta: 0:16:22 lr: 0.000000 grad: 0.2033 (0.3649) loss: 0.7937 (0.7884) time: 0.2672 data: 0.0002 max mem: 26157 Train: [97] [2700/6250] eta: 0:15:54 lr: 0.000000 grad: 0.2255 (0.3628) loss: 0.7828 (0.7883) time: 0.2657 data: 0.0001 max mem: 26157 Train: [97] [2800/6250] eta: 0:15:27 lr: 0.000000 grad: 0.2263 (0.3600) loss: 0.7806 (0.7881) time: 0.2673 data: 0.0001 max mem: 26157 Train: [97] [2900/6250] eta: 0:15:00 lr: 0.000000 grad: 0.2264 (0.3589) loss: 0.7787 (0.7879) time: 0.2668 data: 0.0001 max mem: 26157 Train: [97] [3000/6250] eta: 0:14:33 lr: 0.000000 grad: 0.2304 (0.3585) loss: 0.7862 (0.7879) time: 0.2665 data: 0.0001 max mem: 26157 Train: [97] [3100/6250] eta: 0:14:06 lr: 0.000000 grad: 0.2160 (0.3575) loss: 0.7851 (0.7877) time: 0.2668 data: 0.0001 max mem: 26157 Train: [97] [3200/6250] eta: 0:13:39 lr: 0.000000 grad: 0.2192 (0.3620) loss: 0.7845 (0.7875) time: 0.2667 data: 0.0001 max mem: 26157 Train: [97] [3300/6250] eta: 0:13:12 lr: 0.000000 grad: 0.2409 (0.3643) loss: 0.7887 (0.7875) time: 0.2678 data: 0.0002 max mem: 26157 Train: [97] [3400/6250] eta: 0:12:45 lr: 0.000000 grad: 0.2503 (0.3672) loss: 0.7821 (0.7874) time: 0.2665 data: 0.0001 max mem: 26157 Train: [97] [3500/6250] eta: 0:12:18 lr: 0.000000 grad: 0.2359 (0.3717) loss: 0.7812 (0.7873) time: 0.2675 data: 0.0001 max mem: 26157 Train: [97] [3600/6250] eta: 0:11:51 lr: 0.000000 grad: 0.2396 (0.3711) loss: 0.7827 (0.7873) time: 0.2666 data: 0.0001 max mem: 26157 Train: [97] [3700/6250] eta: 0:11:24 lr: 0.000000 grad: 0.2009 (0.3746) loss: 0.7816 (0.7872) time: 0.2663 data: 0.0001 max mem: 26157 Train: [97] [3800/6250] eta: 0:10:57 lr: 0.000000 grad: 0.2376 (0.3720) loss: 0.7867 (0.7872) time: 0.2668 data: 0.0001 max mem: 26157 Train: [97] [3900/6250] eta: 0:10:30 lr: 0.000000 grad: 0.2327 (0.3734) loss: 0.7825 (0.7872) time: 0.2675 data: 0.0001 max mem: 26157 Train: [97] [4000/6250] eta: 0:10:03 lr: 0.000000 grad: 0.2175 (0.3742) loss: 0.7818 (0.7872) time: 0.2660 data: 0.0001 max mem: 26157 Train: [97] [4100/6250] eta: 0:09:36 lr: 0.000000 grad: 0.2113 (0.3753) loss: 0.7819 (0.7872) time: 0.2678 data: 0.0001 max mem: 26157 Train: [97] [4200/6250] eta: 0:09:09 lr: 0.000000 grad: 0.2229 (0.3740) loss: 0.7791 (0.7871) time: 0.2702 data: 0.0002 max mem: 26157 Train: [97] [4300/6250] eta: 0:08:42 lr: 0.000000 grad: 0.2375 (0.3750) loss: 0.7815 (0.7870) time: 0.2673 data: 0.0001 max mem: 26157 Train: [97] [4400/6250] eta: 0:08:16 lr: 0.000000 grad: 0.2170 (0.3742) loss: 0.7859 (0.7869) time: 0.2711 data: 0.0002 max mem: 26157 Train: [97] [4500/6250] eta: 0:07:49 lr: 0.000000 grad: 0.2183 (0.3824) loss: 0.7812 (0.7868) time: 0.2672 data: 0.0001 max mem: 26157 Train: [97] [4600/6250] eta: 0:07:22 lr: 0.000000 grad: 0.2111 (0.3796) loss: 0.7899 (0.7868) time: 0.2669 data: 0.0002 max mem: 26157 Train: [97] [4700/6250] eta: 0:06:55 lr: 0.000000 grad: 0.2295 (0.3796) loss: 0.7888 (0.7867) time: 0.2667 data: 0.0001 max mem: 26157 Train: [97] [4800/6250] eta: 0:06:28 lr: 0.000000 grad: 0.2221 (0.3862) loss: 0.7867 (0.7866) time: 0.2662 data: 0.0001 max mem: 26157 Train: [97] [4900/6250] eta: 0:06:01 lr: 0.000000 grad: 0.2159 (0.3848) loss: 0.7887 (0.7865) time: 0.2679 data: 0.0002 max mem: 26157 Train: [97] [5000/6250] eta: 0:05:35 lr: 0.000000 grad: 0.2216 (0.3909) loss: 0.7827 (0.7864) time: 0.2673 data: 0.0001 max mem: 26157 Train: [97] [5100/6250] eta: 0:05:08 lr: 0.000000 grad: 0.2327 (0.3908) loss: 0.7755 (0.7864) time: 0.2662 data: 0.0001 max mem: 26157 Train: [97] [5200/6250] eta: 0:04:41 lr: 0.000000 grad: 0.2283 (0.3912) loss: 0.7799 (0.7863) time: 0.2674 data: 0.0002 max mem: 26157 Train: [97] [5300/6250] eta: 0:04:14 lr: 0.000000 grad: 0.2425 (0.3921) loss: 0.7787 (0.7863) time: 0.2674 data: 0.0001 max mem: 26157 Train: [97] [5400/6250] eta: 0:03:47 lr: 0.000000 grad: 0.2111 (0.3902) loss: 0.7931 (0.7863) time: 0.2667 data: 0.0001 max mem: 26157 Train: [97] [5500/6250] eta: 0:03:21 lr: 0.000000 grad: 0.2229 (0.3907) loss: 0.7916 (0.7863) time: 0.2688 data: 0.0002 max mem: 26157 Train: [97] [5600/6250] eta: 0:02:54 lr: 0.000000 grad: 0.2357 (0.3906) loss: 0.7936 (0.7863) time: 0.2666 data: 0.0001 max mem: 26157 Train: [97] [5700/6250] eta: 0:02:27 lr: 0.000000 grad: 0.2242 (0.3933) loss: 0.7878 (0.7863) time: 0.2663 data: 0.0002 max mem: 26157 Train: [97] [5800/6250] eta: 0:02:00 lr: 0.000000 grad: 0.2325 (0.3917) loss: 0.7879 (0.7864) time: 0.2671 data: 0.0002 max mem: 26157 Train: [97] [5900/6250] eta: 0:01:33 lr: 0.000000 grad: 0.2411 (0.3955) loss: 0.7887 (0.7864) time: 0.2676 data: 0.0001 max mem: 26157 Train: [97] [6000/6250] eta: 0:01:06 lr: 0.000000 grad: 0.2288 (0.3945) loss: 0.7916 (0.7864) time: 0.2662 data: 0.0001 max mem: 26157 Train: [97] [6100/6250] eta: 0:00:40 lr: 0.000000 grad: 0.2364 (0.3931) loss: 0.7906 (0.7864) time: 0.2671 data: 0.0001 max mem: 26157 Train: [97] [6200/6250] eta: 0:00:13 lr: 0.000000 grad: 0.2269 (0.3909) loss: 0.7878 (0.7864) time: 0.2669 data: 0.0001 max mem: 26157 Train: [97] [6249/6250] eta: 0:00:00 lr: 0.000000 grad: 0.2358 (0.3898) loss: 0.7849 (0.7864) time: 0.2665 data: 0.0001 max mem: 26157 Train: [97] Total time: 0:27:57 (0.2684 s / it) Averaged stats: lr: 0.000000 grad: 0.2358 (0.3898) loss: 0.7849 (0.7864) Eval (hcp-train-subset): [97] [ 0/62] eta: 0:04:20 loss: 0.7849 (0.7849) time: 4.2078 data: 4.1247 max mem: 26157 Eval (hcp-train-subset): [97] [61/62] eta: 0:00:00 loss: 0.7781 (0.7838) time: 0.0947 data: 0.0127 max mem: 26157 Eval (hcp-train-subset): [97] Total time: 0:00:10 (0.1689 s / it) Averaged stats (hcp-train-subset): loss: 0.7781 (0.7838) Making plots (hcp-train-subset): example=17 Eval (hcp-val): [97] [ 0/62] eta: 0:04:33 loss: 0.8193 (0.8193) time: 4.4086 data: 4.3257 max mem: 26157 Eval (hcp-val): [97] [61/62] eta: 0:00:00 loss: 0.8216 (0.8224) time: 0.0923 data: 0.0084 max mem: 26157 Eval (hcp-val): [97] Total time: 0:00:10 (0.1679 s / it) Averaged stats (hcp-val): loss: 0.8216 (0.8224) Making plots (hcp-val): example=50 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [98] [ 0/6250] eta: 8:38:52 lr: 0.000000 grad: 0.5456 (0.5456) loss: 0.8337 (0.8337) time: 4.9811 data: 4.6967 max mem: 26157 Train: [98] [ 100/6250] eta: 0:33:05 lr: 0.000000 grad: 0.3186 (0.6388) loss: 0.7771 (0.7832) time: 0.2667 data: 0.0001 max mem: 26157 Train: [98] [ 200/6250] eta: 0:29:45 lr: 0.000000 grad: 0.2837 (0.4885) loss: 0.7871 (0.7842) time: 0.2686 data: 0.0001 max mem: 26157 Train: [98] [ 300/6250] eta: 0:28:19 lr: 0.000000 grad: 0.2450 (0.4376) loss: 0.7869 (0.7853) time: 0.2660 data: 0.0001 max mem: 26157 Train: [98] [ 400/6250] eta: 0:27:23 lr: 0.000000 grad: 0.2126 (0.3964) loss: 0.7997 (0.7881) time: 0.2670 data: 0.0002 max mem: 26157 Train: [98] [ 500/6250] eta: 0:26:39 lr: 0.000000 grad: 0.2220 (0.4066) loss: 0.7960 (0.7879) time: 0.2666 data: 0.0001 max mem: 26157 Train: [98] [ 600/6250] eta: 0:26:01 lr: 0.000000 grad: 0.2432 (0.4181) loss: 0.7848 (0.7872) time: 0.2668 data: 0.0001 max mem: 26157 Train: [98] [ 700/6250] eta: 0:25:26 lr: 0.000000 grad: 0.2535 (0.4379) loss: 0.7897 (0.7871) time: 0.2666 data: 0.0001 max mem: 26157 Train: [98] [ 800/6250] eta: 0:24:53 lr: 0.000000 grad: 0.2411 (0.4345) loss: 0.7854 (0.7868) time: 0.2667 data: 0.0001 max mem: 26157 Train: [98] [ 900/6250] eta: 0:24:22 lr: 0.000000 grad: 0.2259 (0.4232) loss: 0.8004 (0.7873) time: 0.2664 data: 0.0001 max mem: 26157 Train: [98] [1000/6250] eta: 0:23:51 lr: 0.000000 grad: 0.2292 (0.4092) loss: 0.7856 (0.7872) time: 0.2686 data: 0.0002 max mem: 26157 Train: [98] [1100/6250] eta: 0:23:21 lr: 0.000000 grad: 0.2311 (0.4063) loss: 0.7757 (0.7869) time: 0.2665 data: 0.0001 max mem: 26157 Train: [98] [1200/6250] eta: 0:22:52 lr: 0.000000 grad: 0.2301 (0.4034) loss: 0.7997 (0.7870) time: 0.2682 data: 0.0002 max mem: 26157 Train: [98] [1300/6250] eta: 0:22:23 lr: 0.000000 grad: 0.2396 (0.3953) loss: 0.7925 (0.7871) time: 0.2672 data: 0.0001 max mem: 26157 Train: [98] [1400/6250] eta: 0:21:54 lr: 0.000000 grad: 0.2395 (0.3994) loss: 0.7850 (0.7871) time: 0.2679 data: 0.0002 max mem: 26157 Train: [98] [1500/6250] eta: 0:21:26 lr: 0.000000 grad: 0.2226 (0.3924) loss: 0.7844 (0.7873) time: 0.2665 data: 0.0001 max mem: 26157 Train: [98] [1600/6250] eta: 0:20:58 lr: 0.000000 grad: 0.2422 (0.3957) loss: 0.7878 (0.7875) time: 0.2664 data: 0.0001 max mem: 26157 Train: [98] [1700/6250] eta: 0:20:30 lr: 0.000000 grad: 0.2250 (0.3977) loss: 0.7906 (0.7875) time: 0.2662 data: 0.0001 max mem: 26157 Train: [98] [1800/6250] eta: 0:20:02 lr: 0.000000 grad: 0.2191 (0.3957) loss: 0.7922 (0.7876) time: 0.2668 data: 0.0002 max mem: 26157 Train: [98] [1900/6250] eta: 0:19:34 lr: 0.000000 grad: 0.2093 (0.3974) loss: 0.7915 (0.7877) time: 0.2668 data: 0.0001 max mem: 26157 Train: [98] [2000/6250] eta: 0:19:07 lr: 0.000000 grad: 0.2069 (0.3951) loss: 0.7958 (0.7879) time: 0.2674 data: 0.0002 max mem: 26157 Train: [98] [2100/6250] eta: 0:18:39 lr: 0.000000 grad: 0.2033 (0.3888) loss: 0.7874 (0.7880) time: 0.2665 data: 0.0001 max mem: 26157 Train: [98] [2200/6250] eta: 0:18:12 lr: 0.000000 grad: 0.2244 (0.3947) loss: 0.7850 (0.7880) time: 0.2663 data: 0.0001 max mem: 26157 Train: [98] [2300/6250] eta: 0:17:44 lr: 0.000000 grad: 0.2166 (0.3934) loss: 0.7861 (0.7881) time: 0.2667 data: 0.0001 max mem: 26157 Train: [98] [2400/6250] eta: 0:17:17 lr: 0.000000 grad: 0.2080 (0.3934) loss: 0.7887 (0.7882) time: 0.2670 data: 0.0001 max mem: 26157 Train: [98] [2500/6250] eta: 0:16:50 lr: 0.000000 grad: 0.2189 (0.4045) loss: 0.7876 (0.7881) time: 0.2674 data: 0.0001 max mem: 26157 Train: [98] [2600/6250] eta: 0:16:22 lr: 0.000000 grad: 0.2177 (0.4032) loss: 0.7929 (0.7881) time: 0.2678 data: 0.0001 max mem: 26157 Train: [98] [2700/6250] eta: 0:15:55 lr: 0.000000 grad: 0.2304 (0.4049) loss: 0.7860 (0.7880) time: 0.2661 data: 0.0001 max mem: 26157 Train: [98] [2800/6250] eta: 0:15:28 lr: 0.000000 grad: 0.2190 (0.4038) loss: 0.7853 (0.7879) time: 0.2674 data: 0.0001 max mem: 26157 Train: [98] [2900/6250] eta: 0:15:01 lr: 0.000000 grad: 0.2260 (0.4021) loss: 0.7848 (0.7879) time: 0.2671 data: 0.0001 max mem: 26157 Train: [98] [3000/6250] eta: 0:14:34 lr: 0.000000 grad: 0.2391 (0.4018) loss: 0.7812 (0.7878) time: 0.2672 data: 0.0001 max mem: 26157 Train: [98] [3100/6250] eta: 0:14:07 lr: 0.000000 grad: 0.2189 (0.4046) loss: 0.7828 (0.7876) time: 0.2685 data: 0.0002 max mem: 26157 Train: [98] [3200/6250] eta: 0:13:40 lr: 0.000000 grad: 0.2347 (0.4045) loss: 0.7834 (0.7874) time: 0.2681 data: 0.0002 max mem: 26157 Train: [98] [3300/6250] eta: 0:13:13 lr: 0.000000 grad: 0.2180 (0.4021) loss: 0.7847 (0.7874) time: 0.2689 data: 0.0001 max mem: 26157 Train: [98] [3400/6250] eta: 0:12:46 lr: 0.000000 grad: 0.2136 (0.4006) loss: 0.7870 (0.7874) time: 0.2676 data: 0.0002 max mem: 26157 Train: [98] [3500/6250] eta: 0:12:19 lr: 0.000000 grad: 0.2401 (0.3987) loss: 0.7864 (0.7873) time: 0.2679 data: 0.0001 max mem: 26157 Train: [98] [3600/6250] eta: 0:11:52 lr: 0.000000 grad: 0.2222 (0.4021) loss: 0.7789 (0.7872) time: 0.2663 data: 0.0001 max mem: 26157 Train: [98] [3700/6250] eta: 0:11:25 lr: 0.000000 grad: 0.2110 (0.4027) loss: 0.7833 (0.7872) time: 0.2672 data: 0.0002 max mem: 26157 Train: [98] [3800/6250] eta: 0:10:58 lr: 0.000000 grad: 0.2167 (0.4061) loss: 0.7912 (0.7871) time: 0.2665 data: 0.0001 max mem: 26157 Train: [98] [3900/6250] eta: 0:10:31 lr: 0.000000 grad: 0.2170 (0.4104) loss: 0.7895 (0.7871) time: 0.2671 data: 0.0001 max mem: 26157 Train: [98] [4000/6250] eta: 0:10:04 lr: 0.000000 grad: 0.2247 (0.4083) loss: 0.7845 (0.7871) time: 0.2663 data: 0.0001 max mem: 26157 Train: [98] [4100/6250] eta: 0:09:37 lr: 0.000000 grad: 0.2086 (0.4093) loss: 0.7853 (0.7871) time: 0.2666 data: 0.0001 max mem: 26157 Train: [98] [4200/6250] eta: 0:09:10 lr: 0.000000 grad: 0.2215 (0.4074) loss: 0.7802 (0.7870) time: 0.2669 data: 0.0001 max mem: 26157 Train: [98] [4300/6250] eta: 0:08:43 lr: 0.000000 grad: 0.2219 (0.4087) loss: 0.7853 (0.7871) time: 0.2672 data: 0.0001 max mem: 26157 Train: [98] [4400/6250] eta: 0:08:16 lr: 0.000000 grad: 0.2350 (0.4049) loss: 0.7896 (0.7871) time: 0.2674 data: 0.0001 max mem: 26157 Train: [98] [4500/6250] eta: 0:07:49 lr: 0.000000 grad: 0.2351 (0.4074) loss: 0.7937 (0.7870) time: 0.2673 data: 0.0002 max mem: 26157 Train: [98] [4600/6250] eta: 0:07:22 lr: 0.000000 grad: 0.2236 (0.4044) loss: 0.7847 (0.7870) time: 0.2665 data: 0.0002 max mem: 26157 Train: [98] [4700/6250] eta: 0:06:55 lr: 0.000000 grad: 0.2239 (0.4027) loss: 0.7843 (0.7869) time: 0.2665 data: 0.0001 max mem: 26157 Train: [98] [4800/6250] eta: 0:06:28 lr: 0.000000 grad: 0.2362 (0.4034) loss: 0.7809 (0.7869) time: 0.2664 data: 0.0001 max mem: 26157 Train: [98] [4900/6250] eta: 0:06:02 lr: 0.000000 grad: 0.2228 (0.4019) loss: 0.7862 (0.7869) time: 0.2674 data: 0.0001 max mem: 26157 Train: [98] [5000/6250] eta: 0:05:35 lr: 0.000000 grad: 0.2208 (0.4003) loss: 0.7816 (0.7869) time: 0.2662 data: 0.0001 max mem: 26157 Train: [98] [5100/6250] eta: 0:05:08 lr: 0.000000 grad: 0.2167 (0.3977) loss: 0.7842 (0.7869) time: 0.2663 data: 0.0001 max mem: 26157 Train: [98] [5200/6250] eta: 0:04:41 lr: 0.000000 grad: 0.2411 (0.3979) loss: 0.7823 (0.7868) time: 0.2674 data: 0.0001 max mem: 26157 Train: [98] [5300/6250] eta: 0:04:14 lr: 0.000000 grad: 0.2016 (0.3971) loss: 0.7883 (0.7868) time: 0.2669 data: 0.0001 max mem: 26157 Train: [98] [5400/6250] eta: 0:03:47 lr: 0.000000 grad: 0.2283 (0.3968) loss: 0.7881 (0.7868) time: 0.2665 data: 0.0001 max mem: 26157 Train: [98] [5500/6250] eta: 0:03:21 lr: 0.000000 grad: 0.2266 (0.3956) loss: 0.7907 (0.7868) time: 0.2671 data: 0.0001 max mem: 26157 Train: [98] [5600/6250] eta: 0:02:54 lr: 0.000000 grad: 0.2393 (0.3933) loss: 0.7866 (0.7868) time: 0.2669 data: 0.0001 max mem: 26157 Train: [98] [5700/6250] eta: 0:02:27 lr: 0.000000 grad: 0.2249 (0.3954) loss: 0.7851 (0.7868) time: 0.2668 data: 0.0002 max mem: 26157 Train: [98] [5800/6250] eta: 0:02:00 lr: 0.000000 grad: 0.2351 (0.3987) loss: 0.7828 (0.7868) time: 0.2665 data: 0.0001 max mem: 26157 Train: [98] [5900/6250] eta: 0:01:33 lr: 0.000000 grad: 0.2383 (0.3989) loss: 0.7866 (0.7868) time: 0.2668 data: 0.0001 max mem: 26157 Train: [98] [6000/6250] eta: 0:01:07 lr: 0.000000 grad: 0.2187 (0.3977) loss: 0.7826 (0.7868) time: 0.2675 data: 0.0001 max mem: 26157 Train: [98] [6100/6250] eta: 0:00:40 lr: 0.000000 grad: 0.2140 (0.3965) loss: 0.7930 (0.7868) time: 0.2665 data: 0.0001 max mem: 26157 Train: [98] [6200/6250] eta: 0:00:13 lr: 0.000000 grad: 0.2126 (0.3986) loss: 0.7946 (0.7869) time: 0.2665 data: 0.0001 max mem: 26157 Train: [98] [6249/6250] eta: 0:00:00 lr: 0.000000 grad: 0.2208 (0.3981) loss: 0.7855 (0.7869) time: 0.2668 data: 0.0001 max mem: 26157 Train: [98] Total time: 0:27:58 (0.2685 s / it) Averaged stats: lr: 0.000000 grad: 0.2208 (0.3981) loss: 0.7855 (0.7869) Eval (hcp-train-subset): [98] [ 0/62] eta: 0:04:31 loss: 0.7857 (0.7857) time: 4.3772 data: 4.2943 max mem: 26157 Eval (hcp-train-subset): [98] [61/62] eta: 0:00:00 loss: 0.7796 (0.7839) time: 0.0876 data: 0.0053 max mem: 26157 Eval (hcp-train-subset): [98] Total time: 0:00:10 (0.1685 s / it) Averaged stats (hcp-train-subset): loss: 0.7796 (0.7839) Making plots (hcp-train-subset): example=3 Eval (hcp-val): [98] [ 0/62] eta: 0:03:27 loss: 0.8152 (0.8152) time: 3.3483 data: 3.2522 max mem: 26157 Eval (hcp-val): [98] [61/62] eta: 0:00:00 loss: 0.8208 (0.8218) time: 0.0886 data: 0.0068 max mem: 26157 Eval (hcp-val): [98] Total time: 0:00:10 (0.1684 s / it) Averaged stats (hcp-val): loss: 0.8208 (0.8218) Making plots (hcp-val): example=32 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth Train: [99] [ 0/6250] eta: 6:24:00 lr: 0.000000 grad: 0.2214 (0.2214) loss: 0.8396 (0.8396) time: 3.6865 data: 3.3715 max mem: 26157 Train: [99] [ 100/6250] eta: 0:31:11 lr: 0.000000 grad: 0.2967 (0.3555) loss: 0.7971 (0.8034) time: 0.2684 data: 0.0001 max mem: 26157 Train: [99] [ 200/6250] eta: 0:28:48 lr: 0.000000 grad: 0.2607 (0.3916) loss: 0.8062 (0.8031) time: 0.2667 data: 0.0001 max mem: 26157 Train: [99] [ 300/6250] eta: 0:27:44 lr: 0.000000 grad: 0.2677 (0.3983) loss: 0.7878 (0.7996) time: 0.2671 data: 0.0002 max mem: 26157 Train: [99] [ 400/6250] eta: 0:26:57 lr: 0.000000 grad: 0.2698 (0.3808) loss: 0.7833 (0.7962) time: 0.2675 data: 0.0001 max mem: 26157 Train: [99] [ 500/6250] eta: 0:26:18 lr: 0.000000 grad: 0.2739 (0.3861) loss: 0.7846 (0.7934) time: 0.2667 data: 0.0001 max mem: 26157 Train: [99] [ 600/6250] eta: 0:25:44 lr: 0.000000 grad: 0.2259 (0.3849) loss: 0.7796 (0.7923) time: 0.2670 data: 0.0001 max mem: 26157 Train: [99] [ 700/6250] eta: 0:25:12 lr: 0.000000 grad: 0.2380 (0.3690) loss: 0.7848 (0.7913) time: 0.2671 data: 0.0001 max mem: 26157 Train: [99] [ 800/6250] eta: 0:24:41 lr: 0.000000 grad: 0.2552 (0.3783) loss: 0.7880 (0.7903) time: 0.2678 data: 0.0001 max mem: 26157 Train: [99] [ 900/6250] eta: 0:24:11 lr: 0.000000 grad: 0.2596 (0.3910) loss: 0.7814 (0.7897) time: 0.2676 data: 0.0002 max mem: 26157 Train: [99] [1000/6250] eta: 0:23:42 lr: 0.000000 grad: 0.2411 (0.3930) loss: 0.7871 (0.7895) time: 0.2672 data: 0.0002 max mem: 26157 Train: [99] [1100/6250] eta: 0:23:13 lr: 0.000000 grad: 0.2246 (0.3905) loss: 0.7849 (0.7893) time: 0.2665 data: 0.0001 max mem: 26157 Train: [99] [1200/6250] eta: 0:22:44 lr: 0.000000 grad: 0.2519 (0.3828) loss: 0.7780 (0.7889) time: 0.2675 data: 0.0001 max mem: 26157 Train: [99] [1300/6250] eta: 0:22:16 lr: 0.000000 grad: 0.2502 (0.3828) loss: 0.7736 (0.7887) time: 0.2681 data: 0.0002 max mem: 26157 Train: [99] [1400/6250] eta: 0:21:48 lr: 0.000000 grad: 0.2556 (0.3848) loss: 0.7877 (0.7887) time: 0.2663 data: 0.0001 max mem: 26157 Train: [99] [1500/6250] eta: 0:21:20 lr: 0.000000 grad: 0.2308 (0.3801) loss: 0.7829 (0.7885) time: 0.2664 data: 0.0001 max mem: 26157 Train: [99] [1600/6250] eta: 0:20:52 lr: 0.000000 grad: 0.2236 (0.3728) loss: 0.7876 (0.7885) time: 0.2671 data: 0.0001 max mem: 26157 Train: [99] [1700/6250] eta: 0:20:25 lr: 0.000000 grad: 0.2251 (0.3672) loss: 0.7875 (0.7884) time: 0.2668 data: 0.0001 max mem: 26157 Train: [99] [1800/6250] eta: 0:19:57 lr: 0.000000 grad: 0.2383 (0.3754) loss: 0.7864 (0.7883) time: 0.2664 data: 0.0002 max mem: 26157 Train: [99] [1900/6250] eta: 0:19:30 lr: 0.000000 grad: 0.2181 (0.3740) loss: 0.7864 (0.7881) time: 0.2676 data: 0.0002 max mem: 26157 Train: [99] [2000/6250] eta: 0:19:02 lr: 0.000000 grad: 0.2120 (0.3740) loss: 0.7830 (0.7881) time: 0.2666 data: 0.0001 max mem: 26157 Train: [99] [2100/6250] eta: 0:18:35 lr: 0.000000 grad: 0.2318 (0.3685) loss: 0.7869 (0.7882) time: 0.2665 data: 0.0001 max mem: 26157 Train: [99] [2200/6250] eta: 0:18:08 lr: 0.000000 grad: 0.2271 (0.3675) loss: 0.7791 (0.7881) time: 0.2677 data: 0.0002 max mem: 26157 Train: [99] [2300/6250] eta: 0:17:41 lr: 0.000000 grad: 0.2255 (0.3663) loss: 0.7826 (0.7881) time: 0.2671 data: 0.0001 max mem: 26157 Train: [99] [2400/6250] eta: 0:17:14 lr: 0.000000 grad: 0.2248 (0.3622) loss: 0.7849 (0.7880) time: 0.2685 data: 0.0002 max mem: 26157 Train: [99] [2500/6250] eta: 0:16:47 lr: 0.000000 grad: 0.2210 (0.3595) loss: 0.7916 (0.7881) time: 0.2668 data: 0.0001 max mem: 26157 Train: [99] [2600/6250] eta: 0:16:20 lr: 0.000000 grad: 0.2223 (0.3548) loss: 0.7889 (0.7882) time: 0.2657 data: 0.0001 max mem: 26157 Train: [99] [2700/6250] eta: 0:15:53 lr: 0.000000 grad: 0.2336 (0.3526) loss: 0.7878 (0.7881) time: 0.2674 data: 0.0001 max mem: 26157 Train: [99] [2800/6250] eta: 0:15:26 lr: 0.000000 grad: 0.2192 (0.3545) loss: 0.7918 (0.7881) time: 0.2669 data: 0.0001 max mem: 26157 Train: [99] [2900/6250] eta: 0:14:59 lr: 0.000000 grad: 0.2226 (0.3559) loss: 0.7820 (0.7881) time: 0.2665 data: 0.0001 max mem: 26157 Train: [99] [3000/6250] eta: 0:14:32 lr: 0.000000 grad: 0.2288 (0.3534) loss: 0.7871 (0.7880) time: 0.2670 data: 0.0001 max mem: 26157 Train: [99] [3100/6250] eta: 0:14:05 lr: 0.000000 grad: 0.2135 (0.3571) loss: 0.7915 (0.7879) time: 0.2674 data: 0.0001 max mem: 26157 Train: [99] [3200/6250] eta: 0:13:38 lr: 0.000000 grad: 0.2282 (0.3559) loss: 0.7837 (0.7879) time: 0.2662 data: 0.0001 max mem: 26157 Train: [99] [3300/6250] eta: 0:13:11 lr: 0.000000 grad: 0.2153 (0.3541) loss: 0.7824 (0.7878) time: 0.2664 data: 0.0001 max mem: 26157 Train: [99] [3400/6250] eta: 0:12:44 lr: 0.000000 grad: 0.2237 (0.3552) loss: 0.7830 (0.7878) time: 0.2672 data: 0.0001 max mem: 26157 Train: [99] [3500/6250] eta: 0:12:17 lr: 0.000000 grad: 0.2271 (0.3534) loss: 0.7857 (0.7879) time: 0.2672 data: 0.0002 max mem: 26157 Train: [99] [3600/6250] eta: 0:11:50 lr: 0.000000 grad: 0.2302 (0.3512) loss: 0.7839 (0.7879) time: 0.2667 data: 0.0001 max mem: 26157 Train: [99] [3700/6250] eta: 0:11:23 lr: 0.000000 grad: 0.2294 (0.3543) loss: 0.7909 (0.7879) time: 0.2679 data: 0.0002 max mem: 26157 Train: [99] [3800/6250] eta: 0:10:56 lr: 0.000000 grad: 0.2414 (0.3541) loss: 0.7787 (0.7879) time: 0.2668 data: 0.0001 max mem: 26157 Train: [99] [3900/6250] eta: 0:10:29 lr: 0.000000 grad: 0.2263 (0.3520) loss: 0.7923 (0.7879) time: 0.2685 data: 0.0002 max mem: 26157 Train: [99] [4000/6250] eta: 0:10:02 lr: 0.000000 grad: 0.2147 (0.3527) loss: 0.7936 (0.7880) time: 0.2665 data: 0.0001 max mem: 26157 Train: [99] [4100/6250] eta: 0:09:36 lr: 0.000000 grad: 0.2123 (0.3508) loss: 0.7863 (0.7880) time: 0.2668 data: 0.0001 max mem: 26157 Train: [99] [4200/6250] eta: 0:09:09 lr: 0.000000 grad: 0.2194 (0.3501) loss: 0.7964 (0.7881) time: 0.2666 data: 0.0001 max mem: 26157 Train: [99] [4300/6250] eta: 0:08:42 lr: 0.000000 grad: 0.2241 (0.3485) loss: 0.7964 (0.7882) time: 0.2673 data: 0.0002 max mem: 26157 Train: [99] [4400/6250] eta: 0:08:15 lr: 0.000000 grad: 0.2099 (0.3478) loss: 0.8008 (0.7883) time: 0.2665 data: 0.0001 max mem: 26157 Train: [99] [4500/6250] eta: 0:07:48 lr: 0.000000 grad: 0.2206 (0.3489) loss: 0.7873 (0.7884) time: 0.2664 data: 0.0001 max mem: 26157 Train: [99] [4600/6250] eta: 0:07:21 lr: 0.000000 grad: 0.2358 (0.3486) loss: 0.7905 (0.7884) time: 0.2668 data: 0.0001 max mem: 26157 Train: [99] [4700/6250] eta: 0:06:55 lr: 0.000000 grad: 0.2454 (0.3477) loss: 0.7845 (0.7884) time: 0.2669 data: 0.0001 max mem: 26157 Train: [99] [4800/6250] eta: 0:06:28 lr: 0.000000 grad: 0.2362 (0.3488) loss: 0.7894 (0.7884) time: 0.2666 data: 0.0001 max mem: 26157 Train: [99] [4900/6250] eta: 0:06:01 lr: 0.000000 grad: 0.2307 (0.3479) loss: 0.7848 (0.7884) time: 0.2666 data: 0.0001 max mem: 26157 Train: [99] [5000/6250] eta: 0:05:34 lr: 0.000000 grad: 0.2480 (0.3492) loss: 0.7874 (0.7883) time: 0.2666 data: 0.0001 max mem: 26157 Train: [99] [5100/6250] eta: 0:05:07 lr: 0.000000 grad: 0.2325 (0.3509) loss: 0.7858 (0.7884) time: 0.2677 data: 0.0001 max mem: 26157 Train: [99] [5200/6250] eta: 0:04:41 lr: 0.000000 grad: 0.2395 (0.3505) loss: 0.7911 (0.7884) time: 0.2665 data: 0.0001 max mem: 26157 Train: [99] [5300/6250] eta: 0:04:14 lr: 0.000000 grad: 0.2259 (0.3525) loss: 0.7950 (0.7884) time: 0.2668 data: 0.0001 max mem: 26157 Train: [99] [5400/6250] eta: 0:03:47 lr: 0.000000 grad: 0.2224 (0.3542) loss: 0.7960 (0.7885) time: 0.2662 data: 0.0001 max mem: 26157 Train: [99] [5500/6250] eta: 0:03:20 lr: 0.000000 grad: 0.2247 (0.3552) loss: 0.7865 (0.7885) time: 0.2665 data: 0.0001 max mem: 26157 Train: [99] [5600/6250] eta: 0:02:53 lr: 0.000000 grad: 0.2480 (0.3557) loss: 0.7850 (0.7885) time: 0.2696 data: 0.0002 max mem: 26157 Train: [99] [5700/6250] eta: 0:02:27 lr: 0.000000 grad: 0.2208 (0.3551) loss: 0.7837 (0.7884) time: 0.2682 data: 0.0001 max mem: 26157 Train: [99] [5800/6250] eta: 0:02:00 lr: 0.000000 grad: 0.2342 (0.3559) loss: 0.7825 (0.7884) time: 0.2665 data: 0.0001 max mem: 26157 Train: [99] [5900/6250] eta: 0:01:33 lr: 0.000000 grad: 0.2352 (0.3561) loss: 0.7859 (0.7884) time: 0.2672 data: 0.0001 max mem: 26157 Train: [99] [6000/6250] eta: 0:01:06 lr: 0.000000 grad: 0.2309 (0.3556) loss: 0.7849 (0.7884) time: 0.2686 data: 0.0001 max mem: 26157 Train: [99] [6100/6250] eta: 0:00:40 lr: 0.000000 grad: 0.2386 (0.3552) loss: 0.7824 (0.7884) time: 0.2670 data: 0.0002 max mem: 26157 Train: [99] [6200/6250] eta: 0:00:13 lr: 0.000000 grad: 0.2191 (0.3540) loss: 0.7964 (0.7884) time: 0.2666 data: 0.0001 max mem: 26157 Train: [99] [6249/6250] eta: 0:00:00 lr: 0.000000 grad: 0.2338 (0.3564) loss: 0.7902 (0.7884) time: 0.2671 data: 0.0001 max mem: 26157 Train: [99] Total time: 0:27:55 (0.2681 s / it) Averaged stats: lr: 0.000000 grad: 0.2338 (0.3564) loss: 0.7902 (0.7884) Eval (hcp-train-subset): [99] [ 0/62] eta: 0:03:27 loss: 0.7831 (0.7831) time: 3.3519 data: 3.2569 max mem: 26157 Eval (hcp-train-subset): [99] [61/62] eta: 0:00:00 loss: 0.7774 (0.7837) time: 0.0926 data: 0.0103 max mem: 26157 Eval (hcp-train-subset): [99] Total time: 0:00:10 (0.1647 s / it) Averaged stats (hcp-train-subset): loss: 0.7774 (0.7837) Making plots (hcp-train-subset): example=39 Eval (hcp-val): [99] [ 0/62] eta: 0:03:55 loss: 0.8165 (0.8165) time: 3.8029 data: 3.7199 max mem: 26157 Eval (hcp-val): [99] [61/62] eta: 0:00:00 loss: 0.8218 (0.8219) time: 0.0872 data: 0.0050 max mem: 26157 Eval (hcp-val): [99] Total time: 0:00:10 (0.1661 s / it) Averaged stats (hcp-val): loss: 0.8218 (0.8219) Making plots (hcp-val): example=39 saving checkpoint experiments/patch_size/output/patch_size/patch8/pretrain/checkpoint-last.pth done! training time: 2 days, 0:34:40