dianecy committed on
Commit ea1014e · verified · 1 Parent(s): 1120a2f

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +38 -0
  2. CGFormer/.gitignore +7 -0
  3. CGFormer/.ipynb_checkpoints/test-checkpoint.py +106 -0
  4. CGFormer/.ipynb_checkpoints/test_mosaic-checkpoint.py +106 -0
  5. CGFormer/LICENSE +21 -0
  6. CGFormer/README.md +56 -0
  7. CGFormer/bash_logs/ACE_filter050.log +480 -0
  8. CGFormer/bash_logs/ACE_filter050_rev.log +528 -0
  9. CGFormer/bash_logs/sanity_node03.log +0 -0
  10. CGFormer/bert/__pycache__/activations.cpython-38.pyc +0 -0
  11. CGFormer/bert/__pycache__/activations.cpython-39.pyc +0 -0
  12. CGFormer/bert/__pycache__/configuration_bert.cpython-38.pyc +0 -0
  13. CGFormer/bert/__pycache__/configuration_bert.cpython-39.pyc +0 -0
  14. CGFormer/bert/__pycache__/configuration_utils.cpython-38.pyc +0 -0
  15. CGFormer/bert/__pycache__/configuration_utils.cpython-39.pyc +0 -0
  16. CGFormer/bert/__pycache__/file_utils.cpython-38.pyc +0 -0
  17. CGFormer/bert/__pycache__/file_utils.cpython-39.pyc +0 -0
  18. CGFormer/bert/__pycache__/generation_utils.cpython-38.pyc +0 -0
  19. CGFormer/bert/__pycache__/generation_utils.cpython-39.pyc +0 -0
  20. CGFormer/bert/__pycache__/modeling_bert.cpython-38.pyc +0 -0
  21. CGFormer/bert/__pycache__/modeling_bert.cpython-39.pyc +0 -0
  22. CGFormer/bert/__pycache__/modeling_utils.cpython-38.pyc +0 -0
  23. CGFormer/bert/__pycache__/modeling_utils.cpython-39.pyc +0 -0
  24. CGFormer/bert/__pycache__/tokenization_bert.cpython-38.pyc +0 -0
  25. CGFormer/bert/__pycache__/tokenization_bert.cpython-39.pyc +0 -0
  26. CGFormer/bert/__pycache__/tokenization_utils.cpython-38.pyc +0 -0
  27. CGFormer/bert/__pycache__/tokenization_utils.cpython-39.pyc +0 -0
  28. CGFormer/bert/__pycache__/tokenization_utils_base.cpython-38.pyc +0 -0
  29. CGFormer/bert/__pycache__/tokenization_utils_base.cpython-39.pyc +0 -0
  30. CGFormer/bert/activations.py +56 -0
  31. CGFormer/bert/configuration_bert.py +143 -0
  32. CGFormer/bert/configuration_utils.py +408 -0
  33. CGFormer/bert/file_utils.py +808 -0
  34. CGFormer/bert/generation_utils.py +993 -0
  35. CGFormer/bert/modeling_bert.py +1569 -0
  36. CGFormer/bert/modeling_utils.py +1268 -0
  37. CGFormer/bert/tokenization_bert.py +545 -0
  38. CGFormer/bert/tokenization_utils.py +723 -0
  39. CGFormer/bert/tokenization_utils_base.py +0 -0
  40. CGFormer/ckpts/swin_base_patch4_window12_384_22k.pth +3 -0
  41. CGFormer/config/config_gref_ace.yaml +63 -0
  42. CGFormer/config/config_mosaic_refcocog_u.yaml +51 -0
  43. CGFormer/config/config_rcc_ace.yaml +63 -0
  44. CGFormer/config/config_rccp_ace.yaml +63 -0
  45. CGFormer/config/config_refzom_ace.yaml +64 -0
  46. CGFormer/config/config_refzom_repro.yaml +62 -0
  47. CGFormer/config/config_refzom_repro_eval.yaml +62 -0
  48. CGFormer/config/impl/config.yaml +53 -0
  49. CGFormer/config/open.yaml +55 -0
  50. CGFormer/config/refcoco_mosaic/config.yaml +59 -0
.gitattributes CHANGED
@@ -34,3 +34,41 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
  RIS-DMMI/refer/evaluation/tokenizer/stanford-corenlp-3.4.1.jar filter=lfs diff=lfs merge=lfs -text
+ CGFormer/external/mmsegmentation/demo/demo.png filter=lfs diff=lfs merge=lfs -text
+ CGFormer/external/mmsegmentation/docs/zh_cn/imgs/qq_group_qrcode.jpg filter=lfs diff=lfs merge=lfs -text
+ CGFormer/external/mmsegmentation/docs/zh_cn/imgs/zhihu_qrcode.jpg filter=lfs diff=lfs merge=lfs -text
+ CGFormer/external/mmsegmentation/projects/medical/2d_image/histopathology/conic2022_seg/conic2022_seg_dataset.png filter=lfs diff=lfs merge=lfs -text
+ CGFormer/external/mmsegmentation/resources/3dogs.jpg filter=lfs diff=lfs merge=lfs -text
+ CGFormer/external/mmsegmentation/resources/seg_demo.gif filter=lfs diff=lfs merge=lfs -text
+ CGFormer/external/mmsegmentation/tests/data/pseudo_loveda_dataset/img_dir/0.png filter=lfs diff=lfs merge=lfs -text
+ CGFormer/external/mmsegmentation/tests/data/pseudo_loveda_dataset/img_dir/1.png filter=lfs diff=lfs merge=lfs -text
+ CGFormer/external/mmsegmentation/tests/data/pseudo_loveda_dataset/img_dir/2.png filter=lfs diff=lfs merge=lfs -text
+ CGFormer/external/mmsegmentation/tests/data/pseudo_potsdam_dataset/img_dir/2_10_0_0_512_512.png filter=lfs diff=lfs merge=lfs -text
+ CGFormer/external/mmsegmentation/tests/data/pseudo_refuge_dataset/img_dir/pseudo_g0001.png filter=lfs diff=lfs merge=lfs -text
+ CGFormer/external/mmsegmentation/tests/data/pseudo_vaihingen_dataset/img_dir/area1_0_0_512_512.png filter=lfs diff=lfs merge=lfs -text
+ CGFormer/image/framework.jpg filter=lfs diff=lfs merge=lfs -text
+ CGFormer/wandb/offline-run-20250307_173512-9h2on932/run-9h2on932.wandb filter=lfs diff=lfs merge=lfs -text
+ CGFormer/wandb/offline-run-20250307_174303-li5zqatl/run-li5zqatl.wandb filter=lfs diff=lfs merge=lfs -text
+ CGFormer/wandb/offline-run-20250307_182402-j7d7o60n/run-j7d7o60n.wandb filter=lfs diff=lfs merge=lfs -text
+ CGFormer/wandb/offline-run-20250307_183427-lje8pep7/run-lje8pep7.wandb filter=lfs diff=lfs merge=lfs -text
+ CGFormer/wandb/offline-run-20250307_191605-qwg5jc6l/run-qwg5jc6l.wandb filter=lfs diff=lfs merge=lfs -text
+ CGFormer/wandb/offline-run-20250307_191652-pdgidm12/run-pdgidm12.wandb filter=lfs diff=lfs merge=lfs -text
+ CGFormer/wandb/offline-run-20250307_200613-ikc5v4qd/run-ikc5v4qd.wandb filter=lfs diff=lfs merge=lfs -text
+ CGFormer/wandb/offline-run-20250307_201001-i0m64au8/run-i0m64au8.wandb filter=lfs diff=lfs merge=lfs -text
+ CGFormer/wandb/offline-run-20250307_210707-ialdzorz/run-ialdzorz.wandb filter=lfs diff=lfs merge=lfs -text
+ CGFormer/wandb/offline-run-20250307_211011-2bbev839/run-2bbev839.wandb filter=lfs diff=lfs merge=lfs -text
+ CGFormer/wandb/offline-run-20250308_193217-dnb3uu3l/run-dnb3uu3l.wandb filter=lfs diff=lfs merge=lfs -text
+ CGFormer/wandb/offline-run-20250308_231240-qhgmf9fk/run-qhgmf9fk.wandb filter=lfs diff=lfs merge=lfs -text
+ CGFormer/wandb/offline-run-20250309_115725-5fgrfjdy/run-5fgrfjdy.wandb filter=lfs diff=lfs merge=lfs -text
+ CGFormer/wandb/offline-run-20250309_170924-0x06srss/run-0x06srss.wandb filter=lfs diff=lfs merge=lfs -text
+ CGFormer/wandb/offline-run-20250309_171616-684omhh0/run-684omhh0.wandb filter=lfs diff=lfs merge=lfs -text
+ CGFormer/wandb/offline-run-20250309_171623-3b8sr48c/run-3b8sr48c.wandb filter=lfs diff=lfs merge=lfs -text
+ CGFormer/wandb/offline-run-20250309_173234-04u0nc2s/run-04u0nc2s.wandb filter=lfs diff=lfs merge=lfs -text
+ CGFormer/wandb/offline-run-20250309_202147-gbujz424/run-gbujz424.wandb filter=lfs diff=lfs merge=lfs -text
+ CGFormer/wandb/offline-run-20250309_203311-xfi3d65b/run-xfi3d65b.wandb filter=lfs diff=lfs merge=lfs -text
+ CGFormer/wandb/offline-run-20250309_203334-e1fdhljy/run-e1fdhljy.wandb filter=lfs diff=lfs merge=lfs -text
+ CGFormer/wandb/offline-run-20250309_205719-wlfh3gyq/run-wlfh3gyq.wandb filter=lfs diff=lfs merge=lfs -text
+ CGFormer/wandb/offline-run-20250309_205856-a8l51dy6/run-a8l51dy6.wandb filter=lfs diff=lfs merge=lfs -text
+ CGFormer/wandb/offline-run-20250309_212058-k3fcizav/run-k3fcizav.wandb filter=lfs diff=lfs merge=lfs -text
+ CGFormer/wandb/offline-run-20250309_212213-a992tfly/run-a992tfly.wandb filter=lfs diff=lfs merge=lfs -text
+ CGFormer/wandb/offline-run-20250311_174825-ghnm4ky9/run-ghnm4ky9.wandb filter=lfs diff=lfs merge=lfs -text
CGFormer/.gitignore ADDED
@@ -0,0 +1,7 @@
+ /exp
+ /wandb/**
+ **/__pycache__
+ /train_open.py
+ /.vscode
+ config/config.yaml
+ config/open.yaml
CGFormer/.ipynb_checkpoints/test-checkpoint.py ADDED
@@ -0,0 +1,106 @@
+ import argparse
+ import os
+ import warnings
+
+ import cv2
+ import torch
+ import torch.nn.parallel
+ import torch.utils.data
+ from loguru import logger
+
+ import deepspeed
+ import utils.config as config
+ from engine.engine import inference
+ from model import build_segmenter
+ from utils.dataset import RefDataset
+ from utils.misc import setup_logger
+
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+
+
+ warnings.filterwarnings("ignore")
+ cv2.setNumThreads(0)
+
+
+ def get_parser():
+     parser = argparse.ArgumentParser(
+         description='Pytorch Referring Expression Segmentation')
+     parser.add_argument('--config',
+                         default='path to xxx.yaml',
+                         type=str,
+                         help='config file')
+     parser.add_argument('--opts',
+                         default=None,
+                         nargs=argparse.REMAINDER,
+                         help='override some settings in the config.')
+     args = parser.parse_args()
+     assert args.config is not None
+     cfg = config.load_cfg_from_cfg_file(args.config)
+     if args.opts is not None:
+         cfg = config.merge_cfg_from_list(cfg, args.opts)
+     return cfg
+
+
+ @logger.catch
+ def main():
+     args = get_parser()
+     args.output_dir = os.path.join(args.output_folder, args.exp_name)
+     if args.visualize:
+         args.vis_dir = os.path.join(args.output_dir, "vis")
+         os.makedirs(args.vis_dir, exist_ok=True)
+
+     # logger
+     setup_logger(args.output_dir,
+                  distributed_rank=0,
+                  filename="test.log",
+                  mode="a")
+     logger.info(args.test_split)
+
+     # build dataset & dataloader
+     test_data = RefDataset(lmdb_dir=args.test_lmdb,
+                            mask_dir=args.mask_root,
+                            dataset=args.dataset,
+                            split=args.test_split,
+                            mode='test',
+                            input_size=args.input_size,
+                            word_length=args.word_len,
+                            args=args)
+     test_loader = torch.utils.data.DataLoader(test_data,
+                                               batch_size=1,
+                                               shuffle=False,
+                                               num_workers=1,
+                                               pin_memory=True)
+
+     # build model
+     model = build_segmenter(args, DDP=False)
+
+     # deepspeed_config = {
+     #     "kernel_inject": False,
+     #     "dtype": "fp16",
+     #     "enable_cuda_graph": True,
+     #     "checkpoint": f'{args.output_dir}/best_model'
+     # }
+
+     # logger.info(model)
+
+     # args.model_dir = os.path.join(args.output_dir, "best_model.pth")
+     if os.path.isdir(args.output_dir):
+         logger.info(f"=> loading checkpoint '{args.output_dir}/best_model'")
+         # checkpoint = torch.load(args.model_dir)
+         # model.module.load_state_dict(checkpoint['model_state_dict'], strict=True)
+         model = load_state_dict_from_zero_checkpoint(model, args.output_dir, tag="best_model").cuda()
+         # model.load_checkpoint(args.output_dir, tag="best_model")
+
+         logger.info(f"=> loading checkpoint '{args.output_dir}/best_model'")
+     else:
+         raise ValueError(
+             "=> resume failed! no checkpoint found at '{}'. Please check args.resume again!"
+             .format(args.output_dir))
+
+     # inference
+     inference(test_loader, model, args)
+
+
+ if __name__ == '__main__':
+     main()
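
Note on the checkpoint loading above: `load_state_dict_from_zero_checkpoint` comes from DeepSpeed's `zero_to_fp32` utilities and reconstructs a full fp32 state dict from the sharded ZeRO checkpoint stored under `<output_dir>/<tag>`. A minimal, stand-alone sketch of consolidating such a checkpoint offline so it can later be loaded without DeepSpeed; the `exp/refcocog_u/...` path is illustrative (taken from this upload's logs), and the `best_model` tag matches the one used in `test.py`:

```
import torch
from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint

# Directory that contains the ZeRO checkpoint folder named after the tag
# ("best_model" here, as in test.py). Adjust the path to your own run.
ckpt_dir = "exp/refcocog_u/ACE_filter050"

# Gather the sharded parameter shards into a single fp32 state dict.
state_dict = get_fp32_state_dict_from_zero_checkpoint(ckpt_dir, tag="best_model")

# Save a plain PyTorch checkpoint that no longer requires DeepSpeed to load.
torch.save(state_dict, "best_model_fp32.pth")
```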
CGFormer/.ipynb_checkpoints/test_mosaic-checkpoint.py ADDED
@@ -0,0 +1,106 @@
+ import argparse
+ import os
+ import warnings
+
+ import cv2
+ import torch
+ import torch.nn.parallel
+ import torch.utils.data
+ from loguru import logger
+
+ import deepspeed
+ import utils.config as config
+ from engine.engine import inference
+ from model import build_segmenter
+ from utils.dataset_mosaic import RefDataset
+ from utils.misc import setup_logger
+
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+
+
+ warnings.filterwarnings("ignore")
+ cv2.setNumThreads(0)
+
+
+ def get_parser():
+     parser = argparse.ArgumentParser(
+         description='Pytorch Referring Expression Segmentation')
+     parser.add_argument('--config',
+                         default='path to xxx.yaml',
+                         type=str,
+                         help='config file')
+     parser.add_argument('--opts',
+                         default=None,
+                         nargs=argparse.REMAINDER,
+                         help='override some settings in the config.')
+     args = parser.parse_args()
+     assert args.config is not None
+     cfg = config.load_cfg_from_cfg_file(args.config)
+     if args.opts is not None:
+         cfg = config.merge_cfg_from_list(cfg, args.opts)
+     return cfg
+
+
+ @logger.catch
+ def main():
+     args = get_parser()
+     args.output_dir = os.path.join(args.output_folder, args.exp_name)
+     if args.visualize:
+         args.vis_dir = os.path.join(args.output_dir, "vis")
+         os.makedirs(args.vis_dir, exist_ok=True)
+
+     # logger
+     setup_logger(args.output_dir,
+                  distributed_rank=0,
+                  filename="test.log",
+                  mode="a")
+     logger.info(args.test_split)
+
+     # build dataset & dataloader
+     test_data = RefDataset(lmdb_dir=args.test_lmdb,
+                            mask_dir=args.mask_root,
+                            dataset=args.dataset,
+                            split=args.test_split,
+                            mode='test',
+                            input_size=args.input_size,
+                            word_length=args.word_len,
+                            args=args)
+     test_loader = torch.utils.data.DataLoader(test_data,
+                                               batch_size=1,
+                                               shuffle=False,
+                                               num_workers=1,
+                                               pin_memory=True)
+
+     # build model
+     model = build_segmenter(args, DDP=False)
+
+     # deepspeed_config = {
+     #     "kernel_inject": False,
+     #     "dtype": "fp16",
+     #     "enable_cuda_graph": True,
+     #     "checkpoint": f'{args.output_dir}/best_model'
+     # }
+
+     # logger.info(model)
+
+     # args.model_dir = os.path.join(args.output_dir, "best_model.pth")
+     if os.path.isdir(args.output_dir):
+         logger.info(f"=> loading checkpoint '{args.output_dir}/best_model'")
+         # checkpoint = torch.load(args.model_dir)
+         # model.module.load_state_dict(checkpoint['model_state_dict'], strict=True)
+         model = load_state_dict_from_zero_checkpoint(model, args.output_dir, tag="best_model").cuda()
+         # model.load_checkpoint(args.output_dir, tag="best_model")
+
+         logger.info(f"=> loading checkpoint '{args.output_dir}/best_model'")
+     else:
+         raise ValueError(
+             "=> resume failed! no checkpoint found at '{}'. Please check args.resume again!"
+             .format(args.output_dir))
+
+     # inference
+     inference(test_loader, model, args)
+
+
+ if __name__ == '__main__':
+     main()
CGFormer/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2023 Toneyaya
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
CGFormer/README.md ADDED
@@ -0,0 +1,56 @@
+ # CGFormer
+ The official PyTorch implementation of the CVPR 2023 paper "Contrastive Grouping with Transformer for Referring Image Segmentation".
+
+ This paper first introduces learnable query tokens to represent objects and then alternately queries linguistic features and groups visual features into the query tokens for object-aware cross-modal reasoning. CGFormer achieves cross-level interaction by jointly updating the query tokens and decoding masks in every two consecutive layers. In addition, we introduce new dataset splits for evaluating the generalization of referring image segmentation models.
+
+ ## Framework
+ <p align="center">
+   <img src="image/framework.jpg" width="1000">
+ </p>
+
+ ## Preparation
+
+ 1. Environment
+    - [PyTorch](www.pytorch.org)
+    - Other dependencies in `requirements.txt`
+ 2. Datasets
+    - Detailed instructions are in [prepare_datasets](data/READEME.md)
+ 3. Pretrained weights
+    - [Swin-Base-window12](https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window12_384_22k.pth)
+
+ ## Train and Test (RIS)
+
+ This implementation only supports multi-GPU DistributedDataParallel training, which is faster and simpler; single-GPU and DataParallel training are not supported. Evaluation, however, only supports single-GPU mode.
+
+ To train CGFormer with 8 GPUs, run:
+
+ ```
+ python -u train.py --config config/config.yaml
+ ```
+
+ To evaluate CGFormer with 1 GPU, run:
+ ```
+ CUDA_VISIBLE_DEVICES=0 python -u test.py \
+       --config config/refcoco/config.yaml \
+       --opts TEST.test_split val \
+              TEST.test_lmdb path/val.lmdb
+ ```
+ ## License
+
+ This project is under the MIT license. See [LICENSE](LICENSE) for details.
+
+ ## Citation
+ If you find our work useful in your research, please consider citing:
+ ```
+ @InProceedings{Tang_2023_CVPR,
+     author    = {Tang, Jiajin and Zheng, Ge and Shi, Cheng and Yang, Sibei},
+     title     = {Contrastive Grouping With Transformer for Referring Image Segmentation},
+     booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
+     month     = {June},
+     year      = {2023},
+     pages     = {23570-23580}
+ }
+ ```
+
+ Some of the code is adapted from [CRIS](https://github.com/DerrickWang005/CRIS.pytorch/tree/master) and [LAVT](https://github.com/yz93/LAVT-RIS).
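
The `--opts` flag in the evaluation command above forwards trailing `KEY VALUE` pairs to the config loader: `get_parser()` in `test.py` collects them with `argparse.REMAINDER` and hands them to `utils.config.merge_cfg_from_list`. A minimal sketch of how such pairs can be folded into a flat config; `merge_opts` is a hypothetical stand-in for illustration only, not the repo's implementation:

```
import argparse

def merge_opts(cfg, opts):
    # Merge ['KEY', 'VALUE', ...] pairs (as collected by argparse.REMAINDER)
    # into a flat config dict. Hypothetical helper; utils.config.merge_cfg_from_list
    # in this repo may behave differently (e.g. type checking against the YAML).
    assert len(opts) % 2 == 0, "opts must come in KEY VALUE pairs"
    for key, value in zip(opts[0::2], opts[1::2]):
        cfg[key] = value
    return cfg

parser = argparse.ArgumentParser()
parser.add_argument('--config', type=str, required=True)
parser.add_argument('--opts', default=[], nargs=argparse.REMAINDER)
args = parser.parse_args(['--config', 'config/refcoco/config.yaml',
                          '--opts', 'TEST.test_split', 'val',
                          'TEST.test_lmdb', 'path/val.lmdb'])
cfg = merge_opts({}, args.opts)
print(cfg)  # {'TEST.test_split': 'val', 'TEST.test_lmdb': 'path/val.lmdb'}
```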
CGFormer/bash_logs/ACE_filter050.log ADDED
@@ -0,0 +1,480 @@
1
+ /home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/launch.py:181: FutureWarning: The module torch.distributed.launch is deprecated
2
+ and will be removed in future. Use torchrun.
3
+ Note that --use-env is set by default in torchrun.
4
+ If your script expects `--local-rank` argument to be set, please
5
+ change it to read from `os.environ['LOCAL_RANK']` instead. See
6
+ https://pytorch.org/docs/stable/distributed.html#launch-utility for
7
+ further instructions
8
+
9
+ warnings.warn(
10
+ [2025-03-03 00:25:13,383] torch.distributed.run: [WARNING]
11
+ [2025-03-03 00:25:13,383] torch.distributed.run: [WARNING] *****************************************
12
+ [2025-03-03 00:25:13,383] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
13
+ [2025-03-03 00:25:13,383] torch.distributed.run: [WARNING] *****************************************
14
+ /home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/albumentations/__init__.py:24: UserWarning: A new version of Albumentations is available: 2.0.5 (you have 1.4.24). Upgrade using: pip install -U albumentations. To disable automatic update checks, set the environment variable NO_ALBUMENTATIONS_UPDATE to 1.
15
+ check_for_updates()
16
+ /home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/albumentations/__init__.py:24: UserWarning: A new version of Albumentations is available: 2.0.5 (you have 1.4.24). Upgrade using: pip install -U albumentations. To disable automatic update checks, set the environment variable NO_ALBUMENTATIONS_UPDATE to 1.
17
+ check_for_updates()
18
+ /home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/albumentations/__init__.py:24: UserWarning: A new version of Albumentations is available: 2.0.5 (you have 1.4.24). Upgrade using: pip install -U albumentations. To disable automatic update checks, set the environment variable NO_ALBUMENTATIONS_UPDATE to 1.
19
+ check_for_updates()
20
+ /home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/albumentations/__init__.py:24: UserWarning: A new version of Albumentations is available: 2.0.5 (you have 1.4.24). Upgrade using: pip install -U albumentations. To disable automatic update checks, set the environment variable NO_ALBUMENTATIONS_UPDATE to 1.
21
+ check_for_updates()
22
+ /home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/timm/models/layers/__init__.py:48: FutureWarning: Importing from timm.models.layers is deprecated, please import via timm.layers
23
+ warnings.warn(f"Importing from {__name__} is deprecated, please import via timm.layers", FutureWarning)
24
+ /home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/timm/models/layers/__init__.py:48: FutureWarning: Importing from timm.models.layers is deprecated, please import via timm.layers
25
+ warnings.warn(f"Importing from {__name__} is deprecated, please import via timm.layers", FutureWarning)
26
+ /home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/timm/models/layers/__init__.py:48: FutureWarning: Importing from timm.models.layers is deprecated, please import via timm.layers
27
+ warnings.warn(f"Importing from {__name__} is deprecated, please import via timm.layers", FutureWarning)
28
+ /home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/timm/models/layers/__init__.py:48: FutureWarning: Importing from timm.models.layers is deprecated, please import via timm.layers
29
+ warnings.warn(f"Importing from {__name__} is deprecated, please import via timm.layers", FutureWarning)
30
+ 2025-03-03 00:25:21.347 | INFO | __main__:main:66 - LOCAL_RANK from env: 1
31
+ 2025-03-03 00:25:21.347 | INFO | __main__:main:66 - LOCAL_RANK from env: 0
32
+ 2025-03-03 00:25:21.359 | INFO | __main__:main:66 - LOCAL_RANK from env: 3
33
+ 2025-03-03 00:25:21.369 | INFO | __main__:main:66 - LOCAL_RANK from env: 2
34
+ 2025-03-03 00:25:21 | INFO | __main__:90 - Starting with GPU: 0, Rank: 0, World Size: 4
35
+ git root error: Cmd('git') failed due to: exit code(128)
36
+ cmdline: git rev-parse --show-toplevel
37
+ stderr: 'fatal: detected dubious ownership in repository at '/data2/projects/chaeyun/CGFormer'
38
+ To add an exception for this directory, call:
39
+
40
+ git config --global --add safe.directory /data2/projects/chaeyun/CGFormer'
41
+ wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
42
+ wandb: Tracking run with wandb version 0.19.1
43
+ wandb: W&B syncing is set to `offline` in this directory.
44
+ wandb: Run `wandb online` or set WANDB_MODE=online to enable cloud syncing.
45
+ node03:2017036:2017036 [0] NCCL INFO Bootstrap : Using eth2:10.1.10.3<0>
46
+ node03:2017036:2017036 [0] NCCL INFO NET/Plugin : Plugin load (libnccl-net.so) returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory
47
+ node03:2017036:2017036 [0] NCCL INFO NET/Plugin : No plugin found, using internal implementation
48
+ node03:2017036:2017036 [0] NCCL INFO cudaDriverVersion 12070
49
+ NCCL version 2.18.5+cuda11.8
50
+ node03:2017039:2017039 [3] NCCL INFO cudaDriverVersion 12070
51
+ node03:2017038:2017038 [2] NCCL INFO cudaDriverVersion 12070
52
+ node03:2017037:2017037 [1] NCCL INFO cudaDriverVersion 12070
53
+ node03:2017038:2017038 [2] NCCL INFO Bootstrap : Using eth2:10.1.10.3<0>
54
+ node03:2017039:2017039 [3] NCCL INFO Bootstrap : Using eth2:10.1.10.3<0>
55
+ node03:2017037:2017037 [1] NCCL INFO Bootstrap : Using eth2:10.1.10.3<0>
56
+ node03:2017038:2017038 [2] NCCL INFO NET/Plugin : Plugin load (libnccl-net.so) returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory
57
+ node03:2017038:2017038 [2] NCCL INFO NET/Plugin : No plugin found, using internal implementation
58
+ node03:2017039:2017039 [3] NCCL INFO NET/Plugin : Plugin load (libnccl-net.so) returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory
59
+ node03:2017039:2017039 [3] NCCL INFO NET/Plugin : No plugin found, using internal implementation
60
+ node03:2017037:2017037 [1] NCCL INFO NET/Plugin : Plugin load (libnccl-net.so) returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory
61
+ node03:2017037:2017037 [1] NCCL INFO NET/Plugin : No plugin found, using internal implementation
62
+ node03:2017039:2017152 [3] NCCL INFO NET/IB : No device found.
63
+ node03:2017039:2017152 [3] NCCL INFO NET/Socket : Using [0]eth2:10.1.10.3<0>
64
+ node03:2017039:2017152 [3] NCCL INFO Using network Socket
65
+ node03:2017038:2017151 [2] NCCL INFO NET/IB : No device found.
66
+ node03:2017038:2017151 [2] NCCL INFO NET/Socket : Using [0]eth2:10.1.10.3<0>
67
+ node03:2017038:2017151 [2] NCCL INFO Using network Socket
68
+ node03:2017036:2017153 [0] NCCL INFO NET/IB : No device found.
69
+ node03:2017037:2017154 [1] NCCL INFO NET/IB : No device found.
70
+ node03:2017036:2017153 [0] NCCL INFO NET/Socket : Using [0]eth2:10.1.10.3<0>
71
+ node03:2017036:2017153 [0] NCCL INFO Using network Socket
72
+ node03:2017037:2017154 [1] NCCL INFO NET/Socket : Using [0]eth2:10.1.10.3<0>
73
+ node03:2017037:2017154 [1] NCCL INFO Using network Socket
74
+ node03:2017038:2017151 [2] NCCL INFO comm 0xaa0bbe0 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId 14000 commId 0x190df8e791fecf1b - Init START
75
+ node03:2017037:2017154 [1] NCCL INFO comm 0xadc9fc0 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 13000 commId 0x190df8e791fecf1b - Init START
76
+ node03:2017039:2017152 [3] NCCL INFO comm 0xb009870 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId 48000 commId 0x190df8e791fecf1b - Init START
77
+ node03:2017036:2017153 [0] NCCL INFO comm 0xb555ea0 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 12000 commId 0x190df8e791fecf1b - Init START
78
+ node03:2017037:2017154 [1] NCCL INFO Setting affinity for GPU 1 to 5500,00000055
79
+ node03:2017038:2017151 [2] NCCL INFO Setting affinity for GPU 2 to 5500,00000055
80
+ node03:2017039:2017152 [3] NCCL INFO Setting affinity for GPU 3 to 5500,00000055
81
+ node03:2017036:2017153 [0] NCCL INFO Setting affinity for GPU 0 to 5500,00000055
82
+ node03:2017039:2017152 [3] NCCL INFO Trees [0] -1/-1/-1->3->2 [1] -1/-1/-1->3->2
83
+ node03:2017036:2017153 [0] NCCL INFO Channel 00/02 : 0 1 2 3
84
+ node03:2017038:2017151 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1
85
+ node03:2017037:2017154 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0
86
+ node03:2017039:2017152 [3] NCCL INFO P2P Chunksize set to 131072
87
+ node03:2017036:2017153 [0] NCCL INFO Channel 01/02 : 0 1 2 3
88
+ node03:2017038:2017151 [2] NCCL INFO P2P Chunksize set to 131072
89
+ node03:2017037:2017154 [1] NCCL INFO P2P Chunksize set to 131072
90
+ node03:2017036:2017153 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1
91
+ node03:2017036:2017153 [0] NCCL INFO P2P Chunksize set to 131072
92
+ node03:2017037:2017154 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[2] via P2P/IPC
93
+ node03:2017038:2017151 [2] NCCL INFO Channel 00 : 2[2] -> 3[3] via SHM/direct/direct
94
+ node03:2017037:2017154 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/IPC
95
+ node03:2017038:2017151 [2] NCCL INFO Channel 01 : 2[2] -> 3[3] via SHM/direct/direct
96
+ node03:2017036:2017153 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/IPC
97
+ node03:2017039:2017152 [3] NCCL INFO Channel 00 : 3[3] -> 0[0] via SHM/direct/direct
98
+ node03:2017039:2017152 [3] NCCL INFO Channel 01 : 3[3] -> 0[0] via SHM/direct/direct
99
+ node03:2017036:2017153 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[1] via P2P/IPC
100
+ node03:2017037:2017154 [1] NCCL INFO Connected all rings
101
+ node03:2017036:2017153 [0] NCCL INFO Connected all rings
102
+ node03:2017038:2017151 [2] NCCL INFO Connected all rings
103
+ node03:2017039:2017152 [3] NCCL INFO Connected all rings
104
+ node03:2017037:2017154 [1] NCCL INFO Channel 00/0 : 1[1] -> 0[0] via P2P/IPC
105
+ node03:2017039:2017152 [3] NCCL INFO Channel 00 : 3[3] -> 2[2] via SHM/direct/direct
106
+ node03:2017037:2017154 [1] NCCL INFO Channel 01/0 : 1[1] -> 0[0] via P2P/IPC
107
+ node03:2017039:2017152 [3] NCCL INFO Channel 01 : 3[3] -> 2[2] via SHM/direct/direct
108
+ node03:2017036:2017153 [0] NCCL INFO Connected all trees
109
+ node03:2017036:2017153 [0] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
110
+ node03:2017036:2017153 [0] NCCL INFO 2 coll channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
111
+ node03:2017038:2017151 [2] NCCL INFO Channel 00/0 : 2[2] -> 1[1] via P2P/IPC
112
+ node03:2017038:2017151 [2] NCCL INFO Channel 01/0 : 2[2] -> 1[1] via P2P/IPC
113
+ node03:2017037:2017154 [1] NCCL INFO Connected all trees
114
+ node03:2017037:2017154 [1] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
115
+ node03:2017037:2017154 [1] NCCL INFO 2 coll channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
116
+ node03:2017038:2017151 [2] NCCL INFO Connected all trees
117
+ node03:2017038:2017151 [2] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
118
+ node03:2017038:2017151 [2] NCCL INFO 2 coll channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
119
+ node03:2017039:2017152 [3] NCCL INFO Connected all trees
120
+ node03:2017039:2017152 [3] NCCL INFO threadThresholds 8/8/64 | 32/8/64 | 512 | 512
121
+ node03:2017039:2017152 [3] NCCL INFO 2 coll channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
122
+ node03:2017039:2017152 [3] NCCL INFO comm 0xb009870 rank 3 nranks 4 cudaDev 3 nvmlDev 3 busId 48000 commId 0x190df8e791fecf1b - Init COMPLETE
123
+ node03:2017037:2017154 [1] NCCL INFO comm 0xadc9fc0 rank 1 nranks 4 cudaDev 1 nvmlDev 1 busId 13000 commId 0x190df8e791fecf1b - Init COMPLETE
124
+ node03:2017038:2017151 [2] NCCL INFO comm 0xaa0bbe0 rank 2 nranks 4 cudaDev 2 nvmlDev 2 busId 14000 commId 0x190df8e791fecf1b - Init COMPLETE
125
+ node03:2017036:2017153 [0] NCCL INFO comm 0xb555ea0 rank 0 nranks 4 cudaDev 0 nvmlDev 0 busId 12000 commId 0x190df8e791fecf1b - Init COMPLETE
126
+ 2025-03-03 00:25:23 | INFO | model:31 - Window size 12!
127
+ 2025-03-03 00:25:24 | INFO | model:51 - Initializing Multi-modal Swin Transformer weights from ckpts/swin_base_patch4_window12_384_22k.pth
128
+ 2025-03-03 00:25:25 | INFO | model.backbone:459 - loading swin success !!!
129
+ 2025-03-03 00:25:29 | INFO | __main__:144 - Model moved to GPU: 0
130
+ 2025-03-03 00:25:29 | INFO | __main__:145 - amsgrad: True
131
+ batch_size: 24
132
+ batch_size_val: 16
133
+ bert: bert-base-uncased
134
+ dataset: refcocog_u
135
+ dist_backend: nccl
136
+ dropout: 0.0
137
+ epochs: 50
138
+ evaluate: True
139
+ exclude_multiobj: True
140
+ exp_name: ACE_filter050
141
+ filter_threshold: 0.5
142
+ fusion_drop: 0.0
143
+ gpu: 0
144
+ hp_selection: strict
145
+ input_size: 480
146
+ local_rank: 0
147
+ loss_option: ACE_verbonly
148
+ lr: 0.0001
149
+ lr_backbone: 5e-05
150
+ lr_text_encoder: 5e-05
151
+ manual_seed: 2051388757
152
+ margin_value: 12
153
+ mask_root: data/masks/refcocog_u
154
+ metric_learning: True
155
+ metric_loss_weight: 0.1
156
+ metric_mode: hardpos_only_sbertsim_refined
157
+ mha: 8-8-8-8
158
+ mixup_lasttwo: True
159
+ num_token: 2
160
+ output_dir: exp/refcoco_u/ACE_filter050
161
+ output_folder: exp/refcoco_u
162
+ print_freq: 100
163
+ rank: 0
164
+ resume: None
165
+ save_freq: 1
166
+ start_epoch: 0
167
+ swin_pretrain: ckpts/swin_base_patch4_window12_384_22k.pth
168
+ swin_type: base
169
+ sync_bn: True
170
+ temperature: 0.07
171
+ test_lmdb: data/lmdb/refcocog_u/test.lmdb
172
+ test_split: test
173
+ token_dim: 512
174
+ train_lmdb: data/lmdb/refcocog_u/train.lmdb
175
+ train_split: train
176
+ val_lmdb: data/lmdb/refcocog_u/val.lmdb
177
+ val_split: val
178
+ vis_dim: 512
179
+ visualize: False
180
+ weight: None
181
+ weight_decay: 0.0001
182
+ window12: True
183
+ word_dim: 768
184
+ word_len: 20
185
+ workers: 32
186
+ workers_val: 8
187
+ world_size: 4
188
+ 2025-03-03 00:28:05 | INFO | utils.misc:108 - Training: Epoch=[1/50] [ 100/1759] Batch=1.34 (1.53) Data=0.00 (0.09) Lr=0.000100 Loss=1.1470 (1.1906) IoU=15.42 (19.69) Prec@50=0.00 (8.61)
189
+ 2025-03-03 00:30:31 | INFO | utils.misc:108 - Training: Epoch=[1/50] [ 200/1759] Batch=1.54 (1.50) Data=0.00 (0.06) Lr=0.000100 Loss=1.1851 (1.1191) IoU=24.54 (23.36) Prec@50=12.77 (12.84)
190
+ 2025-03-03 00:32:56 | INFO | utils.misc:108 - Training: Epoch=[1/50] [ 300/1759] Batch=1.23 (1.48) Data=0.00 (0.05) Lr=0.000100 Loss=0.9150 (1.0781) IoU=33.89 (25.72) Prec@50=29.46 (15.38)
191
+ 2025-03-03 00:35:23 | INFO | utils.misc:108 - Training: Epoch=[1/50] [ 400/1759] Batch=1.47 (1.48) Data=0.00 (0.04) Lr=0.000100 Loss=0.9083 (1.0518) IoU=31.66 (26.80) Prec@50=18.33 (16.54)
192
+ 2025-03-03 00:37:51 | INFO | utils.misc:108 - Training: Epoch=[1/50] [ 500/1759] Batch=1.51 (1.48) Data=0.00 (0.04) Lr=0.000100 Loss=0.9000 (1.0296) IoU=32.92 (27.76) Prec@50=22.46 (17.54)
193
+ 2025-03-03 00:40:19 | INFO | utils.misc:108 - Training: Epoch=[1/50] [ 600/1759] Batch=1.31 (1.48) Data=0.00 (0.04) Lr=0.000100 Loss=0.8722 (1.0109) IoU=40.11 (28.53) Prec@50=26.04 (18.47)
194
+ 2025-03-03 00:42:47 | INFO | utils.misc:108 - Training: Epoch=[1/50] [ 700/1759] Batch=1.56 (1.48) Data=0.00 (0.03) Lr=0.000100 Loss=0.8722 (0.9958) IoU=38.67 (29.04) Prec@50=33.12 (19.23)
195
+ 2025-03-03 00:45:15 | INFO | utils.misc:108 - Training: Epoch=[1/50] [ 800/1759] Batch=1.21 (1.48) Data=0.00 (0.03) Lr=0.000100 Loss=0.9943 (0.9830) IoU=31.87 (29.65) Prec@50=22.32 (20.11)
196
+ 2025-03-03 00:47:43 | INFO | utils.misc:108 - Training: Epoch=[1/50] [ 900/1759] Batch=1.35 (1.48) Data=0.00 (0.03) Lr=0.000100 Loss=0.8947 (0.9714) IoU=36.31 (30.10) Prec@50=25.10 (20.81)
197
+ 2025-03-03 00:50:09 | INFO | utils.misc:108 - Training: Epoch=[1/50] [1000/1759] Batch=1.77 (1.48) Data=0.00 (0.03) Lr=0.000100 Loss=0.8877 (0.9635) IoU=33.68 (30.41) Prec@50=25.49 (21.28)
198
+ 2025-03-03 00:52:36 | INFO | utils.misc:108 - Training: Epoch=[1/50] [1100/1759] Batch=1.59 (1.48) Data=0.00 (0.03) Lr=0.000100 Loss=0.7724 (0.9548) IoU=38.99 (30.85) Prec@50=36.88 (21.78)
199
+ 2025-03-03 00:55:03 | INFO | utils.misc:108 - Training: Epoch=[1/50] [1200/1759] Batch=1.45 (1.48) Data=0.00 (0.03) Lr=0.000100 Loss=0.7028 (0.9458) IoU=47.27 (31.19) Prec@50=49.79 (22.26)
200
+ 2025-03-03 00:57:31 | INFO | utils.misc:108 - Training: Epoch=[1/50] [1300/1759] Batch=1.42 (1.48) Data=0.00 (0.03) Lr=0.000100 Loss=0.7939 (0.9381) IoU=39.03 (31.59) Prec@50=25.15 (22.68)
201
+ 2025-03-03 00:59:59 | INFO | utils.misc:108 - Training: Epoch=[1/50] [1400/1759] Batch=1.27 (1.48) Data=0.00 (0.03) Lr=0.000100 Loss=0.7186 (0.9315) IoU=40.97 (31.96) Prec@50=35.71 (23.16)
202
+ 2025-03-03 01:02:23 | INFO | utils.misc:108 - Training: Epoch=[1/50] [1500/1759] Batch=1.80 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.8415 (0.9243) IoU=33.69 (32.31) Prec@50=23.39 (23.62)
203
+ 2025-03-03 01:04:50 | INFO | utils.misc:108 - Training: Epoch=[1/50] [1600/1759] Batch=2.03 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.8356 (0.9182) IoU=35.39 (32.66) Prec@50=21.51 (24.10)
204
+ 2025-03-03 01:07:15 | INFO | utils.misc:108 - Training: Epoch=[1/50] [1700/1759] Batch=1.40 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.7456 (0.9132) IoU=41.80 (33.01) Prec@50=30.85 (24.57)
205
+ 2025-03-03 01:09:16 | INFO | engine.engine_gref:166 - Evaluation: Epoch=[1/50] mIoU=43.35 oIoU=42.54 Pr@50: 39.60 Pr@60: 28.96 Pr@70: 18.63 Pr@80: 10.71 Pr@90: 3.11
206
+ 2025-03-03 01:12:09 | INFO | utils.misc:108 - Training: Epoch=[2/50] [ 100/1759] Batch=1.30 (1.47) Data=0.00 (0.04) Lr=0.000100 Loss=0.7384 (0.7771) IoU=36.20 (40.67) Prec@50=25.00 (35.48)
207
+ 2025-03-03 01:14:38 | INFO | utils.misc:108 - Training: Epoch=[2/50] [ 200/1759] Batch=1.28 (1.48) Data=0.00 (0.03) Lr=0.000100 Loss=0.7371 (0.7694) IoU=41.03 (40.41) Prec@50=33.33 (35.67)
208
+ 2025-03-03 01:17:04 | INFO | utils.misc:108 - Training: Epoch=[2/50] [ 300/1759] Batch=1.42 (1.48) Data=0.00 (0.03) Lr=0.000100 Loss=0.7440 (0.7664) IoU=37.40 (40.49) Prec@50=29.86 (36.04)
209
+ 2025-03-03 01:19:35 | INFO | utils.misc:108 - Training: Epoch=[2/50] [ 400/1759] Batch=1.50 (1.48) Data=0.00 (0.03) Lr=0.000100 Loss=0.7437 (0.7650) IoU=35.39 (40.39) Prec@50=33.61 (36.12)
210
+ 2025-03-03 01:22:01 | INFO | utils.misc:108 - Training: Epoch=[2/50] [ 500/1759] Batch=1.61 (1.48) Data=0.01 (0.03) Lr=0.000100 Loss=0.8122 (0.7668) IoU=40.75 (40.34) Prec@50=39.79 (36.16)
211
+ 2025-03-03 01:24:25 | INFO | utils.misc:108 - Training: Epoch=[2/50] [ 600/1759] Batch=1.37 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.7956 (0.7655) IoU=35.37 (40.32) Prec@50=29.86 (36.20)
212
+ 2025-03-03 01:26:51 | INFO | utils.misc:108 - Training: Epoch=[2/50] [ 700/1759] Batch=1.30 (1.47) Data=0.01 (0.03) Lr=0.000100 Loss=0.7419 (0.7664) IoU=43.77 (40.30) Prec@50=37.50 (36.12)
213
+ 2025-03-03 01:29:16 | INFO | utils.misc:108 - Training: Epoch=[2/50] [ 800/1759] Batch=1.22 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.7913 (0.7653) IoU=42.33 (40.31) Prec@50=45.83 (36.31)
214
+ 2025-03-03 01:31:42 | INFO | utils.misc:108 - Training: Epoch=[2/50] [ 900/1759] Batch=1.67 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.8527 (0.7637) IoU=32.06 (40.55) Prec@50=30.27 (36.72)
215
+ 2025-03-03 01:34:07 | INFO | utils.misc:108 - Training: Epoch=[2/50] [1000/1759] Batch=1.41 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.8082 (0.7611) IoU=40.24 (40.63) Prec@50=29.76 (36.89)
216
+ 2025-03-03 01:36:33 | INFO | utils.misc:108 - Training: Epoch=[2/50] [1100/1759] Batch=1.27 (1.46) Data=0.00 (0.03) Lr=0.000100 Loss=0.8084 (0.7607) IoU=38.52 (40.66) Prec@50=31.25 (36.92)
217
+ 2025-03-03 01:38:57 | INFO | utils.misc:108 - Training: Epoch=[2/50] [1200/1759] Batch=1.47 (1.46) Data=0.00 (0.03) Lr=0.000100 Loss=0.6577 (0.7588) IoU=41.28 (40.73) Prec@50=39.49 (37.16)
218
+ 2025-03-03 01:41:22 | INFO | utils.misc:108 - Training: Epoch=[2/50] [1300/1759] Batch=1.32 (1.46) Data=0.00 (0.03) Lr=0.000100 Loss=0.7004 (0.7568) IoU=43.16 (40.84) Prec@50=37.05 (37.41)
219
+ 2025-03-03 01:43:49 | INFO | utils.misc:108 - Training: Epoch=[2/50] [1400/1759] Batch=1.23 (1.46) Data=0.00 (0.03) Lr=0.000100 Loss=0.7302 (0.7556) IoU=50.00 (40.88) Prec@50=54.76 (37.47)
220
+ 2025-03-03 01:46:16 | INFO | utils.misc:108 - Training: Epoch=[2/50] [1500/1759] Batch=1.36 (1.46) Data=0.00 (0.03) Lr=0.000100 Loss=0.8330 (0.7550) IoU=39.82 (40.90) Prec@50=27.08 (37.47)
221
+ 2025-03-03 01:48:44 | INFO | utils.misc:108 - Training: Epoch=[2/50] [1600/1759] Batch=1.76 (1.46) Data=0.00 (0.03) Lr=0.000100 Loss=0.7036 (0.7541) IoU=44.15 (40.92) Prec@50=44.38 (37.50)
222
+ 2025-03-03 01:51:09 | INFO | utils.misc:108 - Training: Epoch=[2/50] [1700/1759] Batch=1.50 (1.46) Data=0.00 (0.03) Lr=0.000100 Loss=0.6859 (0.7531) IoU=42.78 (41.05) Prec@50=39.23 (37.69)
223
+ 2025-03-03 01:53:12 | INFO | engine.engine_gref:166 - Evaluation: Epoch=[2/50] mIoU=49.64 oIoU=46.67 Pr@50: 47.79 Pr@60: 37.85 Pr@70: 29.00 Pr@80: 18.63 Pr@90: 6.25
224
+ 2025-03-03 01:56:07 | INFO | utils.misc:108 - Training: Epoch=[3/50] [ 100/1759] Batch=1.46 (1.48) Data=0.00 (0.04) Lr=0.000100 Loss=0.6057 (0.6714) IoU=51.61 (44.71) Prec@50=54.41 (44.64)
225
+ 2025-03-03 01:58:35 | INFO | utils.misc:108 - Training: Epoch=[3/50] [ 200/1759] Batch=1.53 (1.48) Data=0.00 (0.03) Lr=0.000100 Loss=0.9918 (0.6685) IoU=29.66 (44.32) Prec@50=33.96 (43.73)
226
+ 2025-03-03 02:01:02 | INFO | utils.misc:108 - Training: Epoch=[3/50] [ 300/1759] Batch=1.31 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.6285 (0.6760) IoU=49.97 (44.40) Prec@50=53.97 (43.81)
227
+ 2025-03-03 02:03:29 | INFO | utils.misc:108 - Training: Epoch=[3/50] [ 400/1759] Batch=1.83 (1.48) Data=0.01 (0.03) Lr=0.000100 Loss=0.8127 (0.6792) IoU=33.49 (44.24) Prec@50=30.44 (43.48)
228
+ 2025-03-03 02:05:57 | INFO | utils.misc:108 - Training: Epoch=[3/50] [ 500/1759] Batch=1.61 (1.48) Data=0.00 (0.03) Lr=0.000100 Loss=0.6658 (0.6781) IoU=43.32 (44.19) Prec@50=49.29 (43.60)
229
+ 2025-03-03 02:08:23 | INFO | utils.misc:108 - Training: Epoch=[3/50] [ 600/1759] Batch=1.67 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.6875 (0.6804) IoU=45.37 (43.86) Prec@50=38.48 (43.05)
230
+ 2025-03-03 02:10:51 | INFO | utils.misc:108 - Training: Epoch=[3/50] [ 700/1759] Batch=1.31 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.6716 (0.6817) IoU=49.63 (43.89) Prec@50=45.83 (42.92)
231
+ 2025-03-03 02:13:18 | INFO | utils.misc:108 - Training: Epoch=[3/50] [ 800/1759] Batch=1.31 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.8302 (0.6830) IoU=40.04 (44.00) Prec@50=37.35 (43.08)
232
+ 2025-03-03 02:15:43 | INFO | utils.misc:108 - Training: Epoch=[3/50] [ 900/1759] Batch=1.39 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.8283 (0.6843) IoU=36.38 (44.04) Prec@50=28.97 (43.11)
233
+ 2025-03-03 02:18:09 | INFO | utils.misc:108 - Training: Epoch=[3/50] [1000/1759] Batch=1.41 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.6356 (0.6843) IoU=48.44 (43.85) Prec@50=50.15 (42.86)
234
+ 2025-03-03 02:20:34 | INFO | utils.misc:108 - Training: Epoch=[3/50] [1100/1759] Batch=1.86 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.5373 (0.6842) IoU=44.84 (43.78) Prec@50=50.40 (42.74)
235
+ 2025-03-03 02:23:04 | INFO | utils.misc:108 - Training: Epoch=[3/50] [1200/1759] Batch=1.41 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.6934 (0.6824) IoU=43.73 (43.80) Prec@50=42.86 (42.84)
236
+ 2025-03-03 02:25:31 | INFO | utils.misc:108 - Training: Epoch=[3/50] [1300/1759] Batch=1.38 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.8082 (0.6824) IoU=44.25 (43.82) Prec@50=43.25 (42.90)
237
+ 2025-03-03 02:27:58 | INFO | utils.misc:108 - Training: Epoch=[3/50] [1400/1759] Batch=1.77 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.7234 (0.6808) IoU=37.74 (43.89) Prec@50=31.61 (42.97)
238
+ 2025-03-03 02:30:25 | INFO | utils.misc:108 - Training: Epoch=[3/50] [1500/1759] Batch=1.36 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.6207 (0.6785) IoU=52.28 (44.04) Prec@50=49.60 (43.19)
239
+ 2025-03-03 02:32:52 | INFO | utils.misc:108 - Training: Epoch=[3/50] [1600/1759] Batch=1.38 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.5630 (0.6783) IoU=50.30 (44.12) Prec@50=51.19 (43.27)
240
+ 2025-03-03 02:35:20 | INFO | utils.misc:108 - Training: Epoch=[3/50] [1700/1759] Batch=1.35 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.6715 (0.6784) IoU=39.24 (44.26) Prec@50=39.93 (43.41)
241
+ 2025-03-03 02:37:23 | INFO | engine.engine_gref:166 - Evaluation: Epoch=[3/50] mIoU=52.86 oIoU=49.37 Pr@50: 54.19 Pr@60: 45.54 Pr@70: 36.26 Pr@80: 25.82 Pr@90: 10.52
242
+ 2025-03-03 02:40:20 | INFO | utils.misc:108 - Training: Epoch=[4/50] [ 100/1759] Batch=1.27 (1.50) Data=0.00 (0.04) Lr=0.000100 Loss=0.6958 (0.6286) IoU=44.29 (46.25) Prec@50=38.39 (45.99)
243
+ 2025-03-03 02:42:47 | INFO | utils.misc:108 - Training: Epoch=[4/50] [ 200/1759] Batch=1.30 (1.48) Data=0.00 (0.03) Lr=0.000100 Loss=0.6073 (0.6325) IoU=46.17 (46.22) Prec@50=52.38 (45.96)
244
+ 2025-03-03 02:45:16 | INFO | utils.misc:108 - Training: Epoch=[4/50] [ 300/1759] Batch=1.32 (1.49) Data=0.00 (0.03) Lr=0.000100 Loss=0.5563 (0.6330) IoU=52.67 (46.34) Prec@50=59.87 (45.94)
245
+ 2025-03-03 02:47:42 | INFO | utils.misc:108 - Training: Epoch=[4/50] [ 400/1759] Batch=1.30 (1.48) Data=0.00 (0.03) Lr=0.000100 Loss=0.6527 (0.6364) IoU=52.90 (46.53) Prec@50=59.38 (46.17)
246
+ 2025-03-03 02:50:08 | INFO | utils.misc:108 - Training: Epoch=[4/50] [ 500/1759] Batch=1.28 (1.48) Data=0.00 (0.03) Lr=0.000100 Loss=0.7021 (0.6343) IoU=49.21 (46.89) Prec@50=42.71 (46.54)
247
+ 2025-03-03 02:52:34 | INFO | utils.misc:108 - Training: Epoch=[4/50] [ 600/1759] Batch=1.24 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.7423 (0.6356) IoU=39.62 (46.82) Prec@50=36.61 (46.63)
248
+ 2025-03-03 02:55:01 | INFO | utils.misc:108 - Training: Epoch=[4/50] [ 700/1759] Batch=1.34 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.7147 (0.6425) IoU=42.98 (46.63) Prec@50=37.15 (46.32)
249
+ 2025-03-03 02:57:30 | INFO | utils.misc:108 - Training: Epoch=[4/50] [ 800/1759] Batch=1.77 (1.48) Data=0.00 (0.03) Lr=0.000100 Loss=0.5617 (0.6499) IoU=42.31 (46.31) Prec@50=36.67 (45.86)
250
+ 2025-03-03 02:59:58 | INFO | utils.misc:108 - Training: Epoch=[4/50] [ 900/1759] Batch=1.70 (1.48) Data=0.00 (0.03) Lr=0.000100 Loss=0.6406 (0.6511) IoU=46.95 (46.46) Prec@50=45.00 (46.10)
251
+ 2025-03-03 03:02:23 | INFO | utils.misc:108 - Training: Epoch=[4/50] [1000/1759] Batch=1.28 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.5956 (0.6502) IoU=47.62 (46.59) Prec@50=43.75 (46.30)
252
+ 2025-03-03 03:04:52 | INFO | utils.misc:108 - Training: Epoch=[4/50] [1100/1759] Batch=1.31 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.6672 (0.6503) IoU=51.07 (46.69) Prec@50=51.29 (46.50)
253
+ 2025-03-03 03:07:16 | INFO | utils.misc:108 - Training: Epoch=[4/50] [1200/1759] Batch=1.22 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.6364 (0.6505) IoU=51.87 (46.69) Prec@50=51.93 (46.48)
254
+ 2025-03-03 03:09:43 | INFO | utils.misc:108 - Training: Epoch=[4/50] [1300/1759] Batch=1.37 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.4925 (0.6489) IoU=57.07 (46.74) Prec@50=55.01 (46.50)
255
+ 2025-03-03 03:12:10 | INFO | utils.misc:108 - Training: Epoch=[4/50] [1400/1759] Batch=1.79 (1.47) Data=0.01 (0.03) Lr=0.000100 Loss=0.6326 (0.6491) IoU=47.66 (46.81) Prec@50=46.04 (46.58)
256
+ 2025-03-03 03:14:35 | INFO | utils.misc:108 - Training: Epoch=[4/50] [1500/1759] Batch=1.36 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.6763 (0.6487) IoU=52.54 (46.97) Prec@50=58.58 (46.77)
257
+ 2025-03-03 03:17:03 | INFO | utils.misc:108 - Training: Epoch=[4/50] [1600/1759] Batch=1.41 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.6583 (0.6469) IoU=48.20 (47.13) Prec@50=50.83 (46.97)
258
+ 2025-03-03 03:19:31 | INFO | utils.misc:108 - Training: Epoch=[4/50] [1700/1759] Batch=1.27 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.6820 (0.6478) IoU=43.32 (47.12) Prec@50=48.66 (46.90)
259
+ 2025-03-03 03:21:35 | INFO | engine.engine_gref:166 - Evaluation: Epoch=[4/50] mIoU=54.75 oIoU=50.99 Pr@50: 57.73 Pr@60: 48.80 Pr@70: 40.37 Pr@80: 29.04 Pr@90: 12.15
260
+ 2025-03-03 03:24:28 | INFO | utils.misc:108 - Training: Epoch=[5/50] [ 100/1759] Batch=1.28 (1.46) Data=0.00 (0.03) Lr=0.000100 Loss=0.5842 (0.5822) IoU=52.98 (50.21) Prec@50=50.89 (51.29)
261
+ 2025-03-03 03:26:54 | INFO | utils.misc:108 - Training: Epoch=[5/50] [ 200/1759] Batch=1.42 (1.46) Data=0.01 (0.03) Lr=0.000100 Loss=0.5671 (0.5931) IoU=52.04 (49.29) Prec@50=67.11 (50.16)
262
+ 2025-03-03 03:29:22 | INFO | utils.misc:108 - Training: Epoch=[5/50] [ 300/1759] Batch=1.35 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.6269 (0.5962) IoU=49.78 (48.97) Prec@50=53.17 (49.76)
263
+ 2025-03-03 03:31:49 | INFO | utils.misc:108 - Training: Epoch=[5/50] [ 400/1759] Batch=1.48 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.5380 (0.5930) IoU=53.36 (49.08) Prec@50=60.08 (50.03)
264
+ 2025-03-03 03:34:16 | INFO | utils.misc:108 - Training: Epoch=[5/50] [ 500/1759] Batch=1.79 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.6159 (0.5929) IoU=46.81 (49.18) Prec@50=45.49 (50.33)
265
+ 2025-03-03 03:36:45 | INFO | utils.misc:108 - Training: Epoch=[5/50] [ 600/1759] Batch=1.43 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.4669 (0.5917) IoU=56.32 (49.45) Prec@50=55.22 (50.71)
266
+ 2025-03-03 03:39:10 | INFO | utils.misc:108 - Training: Epoch=[5/50] [ 700/1759] Batch=1.57 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.6299 (0.5933) IoU=44.67 (49.38) Prec@50=45.40 (50.54)
267
+ 2025-03-03 03:41:37 | INFO | utils.misc:108 - Training: Epoch=[5/50] [ 800/1759] Batch=1.43 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.7958 (0.5949) IoU=42.46 (49.27) Prec@50=42.29 (50.46)
268
+ 2025-03-03 03:44:03 | INFO | utils.misc:108 - Training: Epoch=[5/50] [ 900/1759] Batch=1.39 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.7949 (0.5958) IoU=40.19 (49.17) Prec@50=37.70 (50.46)
269
+ 2025-03-03 03:46:30 | INFO | utils.misc:108 - Training: Epoch=[5/50] [1000/1759] Batch=1.75 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.4842 (0.5962) IoU=55.96 (49.27) Prec@50=60.33 (50.63)
270
+ 2025-03-03 03:48:56 | INFO | utils.misc:108 - Training: Epoch=[5/50] [1100/1759] Batch=1.34 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.5489 (0.5967) IoU=55.96 (49.18) Prec@50=60.76 (50.47)
271
+ 2025-03-03 03:51:24 | INFO | utils.misc:108 - Training: Epoch=[5/50] [1200/1759] Batch=1.77 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.5284 (0.5968) IoU=48.37 (49.16) Prec@50=56.47 (50.49)
272
+ 2025-03-03 03:53:48 | INFO | utils.misc:108 - Training: Epoch=[5/50] [1300/1759] Batch=1.36 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.5359 (0.5963) IoU=50.98 (49.14) Prec@50=49.26 (50.53)
273
+ 2025-03-03 03:56:18 | INFO | utils.misc:108 - Training: Epoch=[5/50] [1400/1759] Batch=1.38 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.8166 (0.5978) IoU=42.17 (48.99) Prec@50=36.31 (50.28)
274
+ 2025-03-03 03:58:44 | INFO | utils.misc:108 - Training: Epoch=[5/50] [1500/1759] Batch=1.36 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.6011 (0.5979) IoU=48.88 (49.04) Prec@50=53.27 (50.35)
275
+ 2025-03-03 04:01:11 | INFO | utils.misc:108 - Training: Epoch=[5/50] [1600/1759] Batch=1.35 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.7017 (0.5988) IoU=47.34 (48.99) Prec@50=45.04 (50.25)
276
+ 2025-03-03 04:03:39 | INFO | utils.misc:108 - Training: Epoch=[5/50] [1700/1759] Batch=1.66 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.5102 (0.5985) IoU=51.01 (49.05) Prec@50=53.77 (50.32)
277
+ 2025-03-03 04:05:41 | INFO | engine.engine_gref:166 - Evaluation: Epoch=[5/50] mIoU=56.95 oIoU=53.30 Pr@50: 60.75 Pr@60: 53.26 Pr@70: 44.57 Pr@80: 32.76 Pr@90: 14.32
278
+ 2025-03-03 04:08:38 | INFO | utils.misc:108 - Training: Epoch=[6/50] [ 100/1759] Batch=1.27 (1.49) Data=0.00 (0.04) Lr=0.000100 Loss=0.6468 (0.5652) IoU=49.76 (51.26) Prec@50=42.71 (53.73)
279
+ 2025-03-03 04:11:06 | INFO | utils.misc:108 - Training: Epoch=[6/50] [ 200/1759] Batch=1.41 (1.49) Data=0.00 (0.03) Lr=0.000100 Loss=0.6561 (0.5522) IoU=44.92 (52.29) Prec@50=38.89 (54.84)
280
+ 2025-03-03 04:13:33 | INFO | utils.misc:108 - Training: Epoch=[6/50] [ 300/1759] Batch=1.39 (1.48) Data=0.00 (0.03) Lr=0.000100 Loss=0.4852 (0.5548) IoU=52.33 (51.63) Prec@50=54.51 (54.21)
281
+ 2025-03-03 04:15:59 | INFO | utils.misc:108 - Training: Epoch=[6/50] [ 400/1759] Batch=1.37 (1.48) Data=0.00 (0.03) Lr=0.000100 Loss=0.5306 (0.5601) IoU=55.68 (51.13) Prec@50=62.65 (53.69)
282
+ 2025-03-03 04:18:27 | INFO | utils.misc:108 - Training: Epoch=[6/50] [ 500/1759] Batch=1.39 (1.48) Data=0.00 (0.03) Lr=0.000100 Loss=0.4842 (0.5597) IoU=56.85 (51.28) Prec@50=55.85 (53.78)
283
+ 2025-03-03 04:20:53 | INFO | utils.misc:108 - Training: Epoch=[6/50] [ 600/1759] Batch=1.39 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.5329 (0.5586) IoU=47.32 (51.24) Prec@50=50.00 (53.65)
284
+ 2025-03-03 04:23:18 | INFO | utils.misc:108 - Training: Epoch=[6/50] [ 700/1759] Batch=1.14 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.5324 (0.5594) IoU=56.30 (51.25) Prec@50=59.52 (53.65)
285
+ 2025-03-03 04:25:47 | INFO | utils.misc:108 - Training: Epoch=[6/50] [ 800/1759] Batch=1.23 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.5867 (0.5610) IoU=50.78 (51.24) Prec@50=52.38 (53.66)
286
+ 2025-03-03 04:28:14 | INFO | utils.misc:108 - Training: Epoch=[6/50] [ 900/1759] Batch=1.49 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.5270 (0.5646) IoU=51.31 (51.14) Prec@50=48.21 (53.48)
287
+ 2025-03-03 04:30:41 | INFO | utils.misc:108 - Training: Epoch=[6/50] [1000/1759] Batch=1.35 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.5509 (0.5655) IoU=52.33 (51.12) Prec@50=51.79 (53.51)
288
+ 2025-03-03 04:33:09 | INFO | utils.misc:108 - Training: Epoch=[6/50] [1100/1759] Batch=1.25 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.5011 (0.5665) IoU=59.98 (51.03) Prec@50=73.66 (53.37)
289
+ 2025-03-03 04:35:34 | INFO | utils.misc:108 - Training: Epoch=[6/50] [1200/1759] Batch=1.13 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.7173 (0.5666) IoU=49.50 (51.04) Prec@50=52.98 (53.44)
290
+ 2025-03-03 04:38:02 | INFO | utils.misc:108 - Training: Epoch=[6/50] [1300/1759] Batch=1.78 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.6343 (0.5665) IoU=46.32 (50.93) Prec@50=45.29 (53.32)
291
+ 2025-03-03 04:40:29 | INFO | utils.misc:108 - Training: Epoch=[6/50] [1400/1759] Batch=1.27 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.5961 (0.5674) IoU=49.77 (50.91) Prec@50=50.00 (53.32)
292
+ 2025-03-03 04:42:57 | INFO | utils.misc:108 - Training: Epoch=[6/50] [1500/1759] Batch=1.43 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.4715 (0.5672) IoU=52.89 (50.96) Prec@50=54.68 (53.44)
293
+ 2025-03-03 04:45:21 | INFO | utils.misc:108 - Training: Epoch=[6/50] [1600/1759] Batch=1.78 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.5379 (0.5673) IoU=52.68 (50.94) Prec@50=58.28 (53.43)
294
+ 2025-03-03 04:47:46 | INFO | utils.misc:108 - Training: Epoch=[6/50] [1700/1759] Batch=1.70 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.6034 (0.5678) IoU=48.31 (50.90) Prec@50=47.55 (53.37)
295
+ 2025-03-03 04:49:50 | INFO | engine.engine_gref:166 - Evaluation: Epoch=[6/50] mIoU=57.77 oIoU=53.88 Pr@50: 62.42 Pr@60: 54.46 Pr@70: 45.96 Pr@80: 34.74 Pr@90: 15.64
296
+ 2025-03-03 04:52:43 | INFO | utils.misc:108 - Training: Epoch=[7/50] [ 100/1759] Batch=1.26 (1.46) Data=0.00 (0.03) Lr=0.000100 Loss=0.4772 (0.5159) IoU=58.15 (52.95) Prec@50=68.60 (57.46)
297
+ 2025-03-03 04:55:10 | INFO | utils.misc:108 - Training: Epoch=[7/50] [ 200/1759] Batch=1.26 (1.46) Data=0.00 (0.03) Lr=0.000100 Loss=0.5554 (0.5202) IoU=53.89 (52.93) Prec@50=58.48 (57.39)
298
+ 2025-03-03 04:57:36 | INFO | utils.misc:108 - Training: Epoch=[7/50] [ 300/1759] Batch=1.34 (1.46) Data=0.00 (0.03) Lr=0.000100 Loss=0.4606 (0.5208) IoU=58.93 (53.09) Prec@50=64.43 (57.32)
299
+ 2025-03-03 05:00:03 | INFO | utils.misc:108 - Training: Epoch=[7/50] [ 400/1759] Batch=1.41 (1.46) Data=0.00 (0.03) Lr=0.000100 Loss=0.4532 (0.5196) IoU=59.22 (53.45) Prec@50=74.80 (57.58)
300
+ 2025-03-03 05:02:29 | INFO | utils.misc:108 - Training: Epoch=[7/50] [ 500/1759] Batch=1.38 (1.46) Data=0.00 (0.03) Lr=0.000100 Loss=0.5778 (0.5210) IoU=59.85 (53.71) Prec@50=80.95 (57.77)
301
+ 2025-03-03 05:04:55 | INFO | utils.misc:108 - Training: Epoch=[7/50] [ 600/1759] Batch=1.72 (1.46) Data=0.00 (0.03) Lr=0.000100 Loss=0.5753 (0.5226) IoU=56.04 (53.67) Prec@50=55.98 (57.49)
302
+ 2025-03-03 05:07:23 | INFO | utils.misc:108 - Training: Epoch=[7/50] [ 700/1759] Batch=1.36 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.6087 (0.5253) IoU=51.54 (53.41) Prec@50=50.00 (57.11)
303
+ 2025-03-03 05:09:49 | INFO | utils.misc:108 - Training: Epoch=[7/50] [ 800/1759] Batch=1.36 (1.46) Data=0.00 (0.02) Lr=0.000100 Loss=0.6311 (0.5259) IoU=55.29 (53.46) Prec@50=55.51 (57.13)
304
+ 2025-03-03 05:12:16 | INFO | utils.misc:108 - Training: Epoch=[7/50] [ 900/1759] Batch=1.26 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.7219 (0.5281) IoU=49.02 (53.55) Prec@50=53.87 (57.31)
305
+ 2025-03-03 05:14:42 | INFO | utils.misc:108 - Training: Epoch=[7/50] [1000/1759] Batch=1.43 (1.46) Data=0.00 (0.02) Lr=0.000100 Loss=0.4692 (0.5275) IoU=65.16 (53.58) Prec@50=78.17 (57.38)
306
+ 2025-03-03 05:17:07 | INFO | utils.misc:108 - Training: Epoch=[7/50] [1100/1759] Batch=1.26 (1.46) Data=0.00 (0.02) Lr=0.000100 Loss=0.5824 (0.5272) IoU=45.79 (53.59) Prec@50=48.66 (57.39)
307
+ 2025-03-03 05:19:35 | INFO | utils.misc:108 - Training: Epoch=[7/50] [1200/1759] Batch=1.81 (1.46) Data=0.00 (0.02) Lr=0.000100 Loss=0.4784 (0.5274) IoU=53.63 (53.55) Prec@50=54.79 (57.31)
308
+ 2025-03-03 05:22:04 | INFO | utils.misc:108 - Training: Epoch=[7/50] [1300/1759] Batch=1.72 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.5570 (0.5278) IoU=51.42 (53.54) Prec@50=50.05 (57.32)
309
+ 2025-03-03 05:24:31 | INFO | utils.misc:108 - Training: Epoch=[7/50] [1400/1759] Batch=1.28 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.4228 (0.5272) IoU=57.91 (53.52) Prec@50=65.62 (57.28)
310
+ 2025-03-03 05:27:00 | INFO | utils.misc:108 - Training: Epoch=[7/50] [1500/1759] Batch=1.83 (1.47) Data=0.01 (0.02) Lr=0.000100 Loss=0.4513 (0.5273) IoU=58.91 (53.56) Prec@50=49.84 (57.28)
311
+ 2025-03-03 05:29:27 | INFO | utils.misc:108 - Training: Epoch=[7/50] [1600/1759] Batch=1.39 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.4685 (0.5275) IoU=58.79 (53.56) Prec@50=64.34 (57.24)
312
+ 2025-03-03 05:31:54 | INFO | utils.misc:108 - Training: Epoch=[7/50] [1700/1759] Batch=1.62 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.4928 (0.5272) IoU=47.91 (53.57) Prec@50=45.88 (57.28)
313
+ 2025-03-03 05:33:56 | INFO | engine.engine_gref:166 - Evaluation: Epoch=[7/50] mIoU=60.25 oIoU=55.60 Pr@50: 65.68 Pr@60: 59.39 Pr@70: 51.09 Pr@80: 38.20 Pr@90: 17.59
314
+ 2025-03-03 05:36:51 | INFO | utils.misc:108 - Training: Epoch=[8/50] [ 100/1759] Batch=1.44 (1.48) Data=0.00 (0.04) Lr=0.000100 Loss=0.4619 (0.4767) IoU=50.05 (57.19) Prec@50=56.96 (61.73)
315
+ 2025-03-03 05:39:19 | INFO | utils.misc:108 - Training: Epoch=[8/50] [ 200/1759] Batch=1.33 (1.48) Data=0.00 (0.03) Lr=0.000100 Loss=0.4224 (0.4827) IoU=60.05 (56.71) Prec@50=67.26 (61.55)
316
+ 2025-03-03 05:41:44 | INFO | utils.misc:108 - Training: Epoch=[8/50] [ 300/1759] Batch=1.34 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.4845 (0.4818) IoU=55.20 (56.42) Prec@50=65.23 (61.33)
317
+ 2025-03-03 05:44:13 | INFO | utils.misc:108 - Training: Epoch=[8/50] [ 400/1759] Batch=1.95 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.5197 (0.4854) IoU=51.30 (56.06) Prec@50=53.19 (60.90)
318
+ 2025-03-03 05:46:43 | INFO | utils.misc:108 - Training: Epoch=[8/50] [ 500/1759] Batch=1.39 (1.48) Data=0.00 (0.03) Lr=0.000100 Loss=0.5332 (0.4882) IoU=48.13 (55.89) Prec@50=51.34 (60.70)
319
+ 2025-03-03 05:49:08 | INFO | utils.misc:108 - Training: Epoch=[8/50] [ 600/1759] Batch=1.43 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.3839 (0.4883) IoU=52.71 (55.88) Prec@50=55.28 (60.68)
320
+ 2025-03-03 05:51:36 | INFO | utils.misc:108 - Training: Epoch=[8/50] [ 700/1759] Batch=1.25 (1.48) Data=0.00 (0.02) Lr=0.000100 Loss=0.4711 (0.4896) IoU=53.68 (55.75) Prec@50=63.10 (60.51)
321
+ 2025-03-03 05:54:02 | INFO | utils.misc:108 - Training: Epoch=[8/50] [ 800/1759] Batch=1.84 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.3726 (0.4901) IoU=57.30 (55.78) Prec@50=64.32 (60.39)
322
+ 2025-03-03 05:56:27 | INFO | utils.misc:108 - Training: Epoch=[8/50] [ 900/1759] Batch=1.25 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.4996 (0.4914) IoU=56.04 (55.96) Prec@50=54.91 (60.61)
323
+ 2025-03-03 05:58:54 | INFO | utils.misc:108 - Training: Epoch=[8/50] [1000/1759] Batch=1.41 (1.47) Data=0.01 (0.02) Lr=0.000100 Loss=0.5232 (0.4923) IoU=60.26 (55.92) Prec@50=69.84 (60.50)
324
+ 2025-03-03 06:01:18 | INFO | utils.misc:108 - Training: Epoch=[8/50] [1100/1759] Batch=1.80 (1.47) Data=0.01 (0.02) Lr=0.000100 Loss=0.5720 (0.4947) IoU=50.41 (55.79) Prec@50=54.66 (60.34)
325
+ 2025-03-03 06:03:42 | INFO | utils.misc:108 - Training: Epoch=[8/50] [1200/1759] Batch=1.44 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.4019 (0.4951) IoU=61.91 (55.80) Prec@50=69.10 (60.33)
326
+ 2025-03-03 06:06:09 | INFO | utils.misc:108 - Training: Epoch=[8/50] [1300/1759] Batch=1.35 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.5695 (0.4957) IoU=54.77 (55.78) Prec@50=55.75 (60.21)
327
+ 2025-03-03 06:08:38 | INFO | utils.misc:108 - Training: Epoch=[8/50] [1400/1759] Batch=1.84 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.6492 (0.4982) IoU=42.49 (55.71) Prec@50=42.14 (60.10)
328
+ 2025-03-03 06:11:03 | INFO | utils.misc:108 - Training: Epoch=[8/50] [1500/1759] Batch=1.36 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.5565 (0.4978) IoU=49.92 (55.73) Prec@50=46.83 (60.09)
329
+ 2025-03-03 06:13:31 | INFO | utils.misc:108 - Training: Epoch=[8/50] [1600/1759] Batch=1.30 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.4775 (0.4973) IoU=63.03 (55.75) Prec@50=68.60 (60.11)
330
+ 2025-03-03 06:16:00 | INFO | utils.misc:108 - Training: Epoch=[8/50] [1700/1759] Batch=1.34 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.4690 (0.4972) IoU=55.66 (55.79) Prec@50=46.83 (60.15)
331
+ 2025-03-03 06:18:02 | INFO | engine.engine_gref:166 - Evaluation: Epoch=[8/50] mIoU=60.71 oIoU=56.70 Pr@50: 66.73 Pr@60: 59.36 Pr@70: 50.62 Pr@80: 38.63 Pr@90: 16.96
332
+ 2025-03-03 06:20:56 | INFO | utils.misc:108 - Training: Epoch=[9/50] [ 100/1759] Batch=1.45 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.3857 (0.4604) IoU=59.86 (58.29) Prec@50=64.48 (63.98)
333
+ 2025-03-03 06:23:21 | INFO | utils.misc:108 - Training: Epoch=[9/50] [ 200/1759] Batch=1.28 (1.46) Data=0.00 (0.03) Lr=0.000100 Loss=0.4570 (0.4735) IoU=63.38 (57.40) Prec@50=73.66 (62.69)
334
+ 2025-03-03 06:25:48 | INFO | utils.misc:108 - Training: Epoch=[9/50] [ 300/1759] Batch=1.43 (1.46) Data=0.00 (0.03) Lr=0.000100 Loss=0.5468 (0.4705) IoU=55.90 (57.64) Prec@50=59.08 (62.91)
335
+ 2025-03-03 06:28:13 | INFO | utils.misc:108 - Training: Epoch=[9/50] [ 400/1759] Batch=1.28 (1.46) Data=0.00 (0.03) Lr=0.000100 Loss=0.4366 (0.4683) IoU=63.43 (57.61) Prec@50=68.75 (62.74)
336
+ 2025-03-03 06:30:38 | INFO | utils.misc:108 - Training: Epoch=[9/50] [ 500/1759] Batch=1.31 (1.46) Data=0.00 (0.03) Lr=0.000100 Loss=0.6139 (0.4667) IoU=54.24 (57.58) Prec@50=55.75 (62.60)
337
+ 2025-03-03 06:33:03 | INFO | utils.misc:108 - Training: Epoch=[9/50] [ 600/1759] Batch=1.22 (1.46) Data=0.00 (0.02) Lr=0.000100 Loss=0.5159 (0.4657) IoU=58.25 (57.54) Prec@50=59.82 (62.54)
338
+ 2025-03-03 06:35:27 | INFO | utils.misc:108 - Training: Epoch=[9/50] [ 700/1759] Batch=1.20 (1.45) Data=0.00 (0.02) Lr=0.000100 Loss=0.4972 (0.4664) IoU=57.24 (57.36) Prec@50=67.56 (62.27)
339
+ 2025-03-03 06:37:55 | INFO | utils.misc:108 - Training: Epoch=[9/50] [ 800/1759] Batch=1.37 (1.46) Data=0.00 (0.02) Lr=0.000100 Loss=0.5249 (0.4666) IoU=58.34 (57.30) Prec@50=60.76 (62.13)
340
+ 2025-03-03 06:40:21 | INFO | utils.misc:108 - Training: Epoch=[9/50] [ 900/1759] Batch=1.27 (1.46) Data=0.00 (0.02) Lr=0.000100 Loss=0.5121 (0.4688) IoU=55.35 (57.25) Prec@50=52.23 (62.01)
341
+ 2025-03-03 06:42:47 | INFO | utils.misc:108 - Training: Epoch=[9/50] [1000/1759] Batch=1.45 (1.46) Data=0.00 (0.02) Lr=0.000100 Loss=0.5049 (0.4697) IoU=53.27 (57.15) Prec@50=58.41 (61.89)
342
+ 2025-03-03 06:45:12 | INFO | utils.misc:108 - Training: Epoch=[9/50] [1100/1759] Batch=1.25 (1.46) Data=0.00 (0.02) Lr=0.000100 Loss=0.5198 (0.4704) IoU=57.83 (57.14) Prec@50=58.48 (61.89)
343
+ 2025-03-03 06:47:41 | INFO | utils.misc:108 - Training: Epoch=[9/50] [1200/1759] Batch=1.39 (1.46) Data=0.00 (0.02) Lr=0.000100 Loss=0.4482 (0.4701) IoU=56.85 (57.09) Prec@50=62.10 (61.84)
344
+ 2025-03-03 06:50:09 | INFO | utils.misc:108 - Training: Epoch=[9/50] [1300/1759] Batch=1.39 (1.46) Data=0.00 (0.02) Lr=0.000100 Loss=0.6194 (0.4711) IoU=45.18 (57.02) Prec@50=49.31 (61.76)
345
+ 2025-03-03 06:52:37 | INFO | utils.misc:108 - Training: Epoch=[9/50] [1400/1759] Batch=1.26 (1.46) Data=0.00 (0.02) Lr=0.000100 Loss=0.5719 (0.4718) IoU=53.01 (56.94) Prec@50=56.99 (61.68)
346
+ 2025-03-03 06:55:05 | INFO | utils.misc:108 - Training: Epoch=[9/50] [1500/1759] Batch=1.31 (1.46) Data=0.00 (0.02) Lr=0.000100 Loss=0.3935 (0.4720) IoU=62.80 (56.86) Prec@50=68.85 (61.58)
347
+ 2025-03-03 06:57:30 | INFO | utils.misc:108 - Training: Epoch=[9/50] [1600/1759] Batch=1.32 (1.46) Data=0.00 (0.02) Lr=0.000100 Loss=0.5472 (0.4724) IoU=60.17 (56.79) Prec@50=65.97 (61.48)
348
+ 2025-03-03 06:59:58 | INFO | utils.misc:108 - Training: Epoch=[9/50] [1700/1759] Batch=1.38 (1.46) Data=0.00 (0.02) Lr=0.000100 Loss=0.3759 (0.4721) IoU=60.89 (56.73) Prec@50=63.24 (61.42)
349
+ 2025-03-03 07:01:58 | INFO | engine.engine_gref:166 - Evaluation: Epoch=[9/50] mIoU=61.79 oIoU=57.95 Pr@50: 68.91 Pr@60: 61.84 Pr@70: 54.15 Pr@80: 40.99 Pr@90: 18.32
350
+ 2025-03-03 07:04:52 | INFO | utils.misc:108 - Training: Epoch=[10/50] [ 100/1759] Batch=1.42 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.4251 (0.4314) IoU=59.89 (58.14) Prec@50=68.54 (63.57)
351
+ 2025-03-03 07:07:22 | INFO | utils.misc:108 - Training: Epoch=[10/50] [ 200/1759] Batch=1.69 (1.48) Data=0.00 (0.03) Lr=0.000100 Loss=0.4248 (0.4374) IoU=62.13 (58.02) Prec@50=67.50 (63.52)
352
+ 2025-03-03 07:09:47 | INFO | utils.misc:108 - Training: Epoch=[10/50] [ 300/1759] Batch=1.39 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.3592 (0.4369) IoU=56.82 (58.28) Prec@50=68.06 (63.59)
353
+ 2025-03-03 07:12:15 | INFO | utils.misc:108 - Training: Epoch=[10/50] [ 400/1759] Batch=1.35 (1.47) Data=0.01 (0.03) Lr=0.000100 Loss=0.4061 (0.4374) IoU=64.88 (58.70) Prec@50=70.29 (63.98)
354
+ 2025-03-03 07:14:41 | INFO | utils.misc:108 - Training: Epoch=[10/50] [ 500/1759] Batch=1.37 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.4022 (0.4395) IoU=60.43 (58.72) Prec@50=59.42 (63.88)
355
+ 2025-03-03 07:17:09 | INFO | utils.misc:108 - Training: Epoch=[10/50] [ 600/1759] Batch=1.32 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.6635 (0.4405) IoU=42.66 (58.58) Prec@50=49.95 (63.78)
356
+ 2025-03-03 07:19:37 | INFO | utils.misc:108 - Training: Epoch=[10/50] [ 700/1759] Batch=1.27 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.4055 (0.4416) IoU=63.60 (58.54) Prec@50=67.86 (63.75)
357
+ 2025-03-03 07:22:05 | INFO | utils.misc:108 - Training: Epoch=[10/50] [ 800/1759] Batch=1.22 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.3670 (0.4434) IoU=68.35 (58.42) Prec@50=73.66 (63.65)
358
+ 2025-03-03 07:24:32 | INFO | utils.misc:108 - Training: Epoch=[10/50] [ 900/1759] Batch=1.27 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.4577 (0.4437) IoU=59.77 (58.38) Prec@50=72.02 (63.55)
359
+ 2025-03-03 07:26:55 | INFO | utils.misc:108 - Training: Epoch=[10/50] [1000/1759] Batch=1.38 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.3914 (0.4433) IoU=56.84 (58.38) Prec@50=71.92 (63.59)
360
+ 2025-03-03 07:29:24 | INFO | utils.misc:108 - Training: Epoch=[10/50] [1100/1759] Batch=1.40 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.4301 (0.4452) IoU=64.13 (58.34) Prec@50=64.24 (63.54)
361
+ 2025-03-03 07:31:49 | INFO | utils.misc:108 - Training: Epoch=[10/50] [1200/1759] Batch=1.78 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.5309 (0.4448) IoU=51.24 (58.37) Prec@50=55.62 (63.55)
362
+ 2025-03-03 07:34:15 | INFO | utils.misc:108 - Training: Epoch=[10/50] [1300/1759] Batch=1.31 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.5097 (0.4461) IoU=53.53 (58.35) Prec@50=59.08 (63.49)
363
+ 2025-03-03 07:36:42 | INFO | utils.misc:108 - Training: Epoch=[10/50] [1400/1759] Batch=1.40 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.4394 (0.4461) IoU=58.68 (58.39) Prec@50=59.52 (63.55)
364
+ 2025-03-03 07:39:05 | INFO | utils.misc:108 - Training: Epoch=[10/50] [1500/1759] Batch=1.39 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.4440 (0.4464) IoU=59.17 (58.38) Prec@50=67.06 (63.51)
365
+ 2025-03-03 07:41:31 | INFO | utils.misc:108 - Training: Epoch=[10/50] [1600/1759] Batch=1.34 (1.47) Data=0.00 (0.02) Lr=0.000100 Loss=0.4039 (0.4464) IoU=56.05 (58.33) Prec@50=60.76 (63.41)
366
+ 2025-03-03 07:43:56 | INFO | utils.misc:108 - Training: Epoch=[10/50] [1700/1759] Batch=1.61 (1.46) Data=0.00 (0.02) Lr=0.000100 Loss=0.4076 (0.4472) IoU=57.63 (58.25) Prec@50=65.15 (63.35)
367
+ 2025-03-03 07:46:00 | INFO | engine.engine_gref:166 - Evaluation: Epoch=[10/50] mIoU=61.95 oIoU=58.15 Pr@50: 69.22 Pr@60: 62.58 Pr@70: 54.46 Pr@80: 42.31 Pr@90: 19.91
368
+ 2025-03-03 07:48:54 | INFO | utils.misc:108 - Training: Epoch=[11/50] [ 100/1759] Batch=1.31 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.4570 (0.3980) IoU=60.56 (59.63) Prec@50=61.71 (65.29)
369
+ 2025-03-03 07:51:22 | INFO | utils.misc:108 - Training: Epoch=[11/50] [ 200/1759] Batch=1.44 (1.48) Data=0.00 (0.03) Lr=0.000100 Loss=0.4088 (0.4088) IoU=55.49 (58.90) Prec@50=56.47 (64.34)
370
+ 2025-03-03 07:53:49 | INFO | utils.misc:108 - Training: Epoch=[11/50] [ 300/1759] Batch=1.34 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.5129 (0.4076) IoU=58.42 (59.49) Prec@50=58.73 (65.27)
371
+ 2025-03-03 07:56:14 | INFO | utils.misc:108 - Training: Epoch=[11/50] [ 400/1759] Batch=1.36 (1.47) Data=0.00 (0.03) Lr=0.000100 Loss=0.5632 (0.4112) IoU=48.45 (59.59) Prec@50=45.14 (65.45)
372
+ 2025-03-03 07:58:39 | INFO | utils.misc:108 - Training: Epoch=[11/50] [ 500/1759] Batch=1.45 (1.46) Data=0.00 (0.03) Lr=0.000100 Loss=0.3631 (0.4145) IoU=60.37 (59.62) Prec@50=61.03 (65.47)
373
+ 2025-03-03 08:01:04 | INFO | utils.misc:108 - Training: Epoch=[11/50] [ 600/1759] Batch=1.42 (1.46) Data=0.00 (0.03) Lr=0.000100 Loss=0.2917 (0.4155) IoU=71.40 (59.60) Prec@50=78.17 (65.43)
374
+ 2025-03-03 08:03:30 | INFO | utils.misc:108 - Training: Epoch=[11/50] [ 700/1759] Batch=1.44 (1.46) Data=0.00 (0.03) Lr=0.000100 Loss=0.4789 (0.4174) IoU=58.62 (59.62) Prec@50=61.34 (65.38)
375
+ 2025-03-03 08:05:57 | INFO | utils.misc:108 - Training: Epoch=[11/50] [ 800/1759] Batch=1.33 (1.46) Data=0.00 (0.03) Lr=0.000100 Loss=0.3516 (0.4210) IoU=60.97 (59.48) Prec@50=69.59 (65.11)
376
+ 2025-03-03 08:08:21 | INFO | utils.misc:108 - Training: Epoch=[11/50] [ 900/1759] Batch=1.95 (1.46) Data=0.00 (0.03) Lr=0.000100 Loss=0.3417 (0.4205) IoU=59.15 (59.43) Prec@50=67.80 (65.06)
377
+ 2025-03-03 08:10:47 | INFO | utils.misc:108 - Training: Epoch=[11/50] [1000/1759] Batch=1.21 (1.46) Data=0.00 (0.02) Lr=0.000100 Loss=0.3900 (0.4195) IoU=64.07 (59.51) Prec@50=68.75 (65.12)
378
+ 2025-03-03 08:13:12 | INFO | utils.misc:108 - Training: Epoch=[11/50] [1100/1759] Batch=1.51 (1.46) Data=0.00 (0.02) Lr=0.000100 Loss=0.3569 (0.4213) IoU=69.53 (59.50) Prec@50=74.52 (65.06)
379
+ 2025-03-03 08:15:38 | INFO | utils.misc:108 - Training: Epoch=[11/50] [1200/1759] Batch=1.22 (1.46) Data=0.01 (0.02) Lr=0.000100 Loss=0.5098 (0.4224) IoU=58.16 (59.38) Prec@50=62.05 (64.97)
380
+ 2025-03-03 08:18:03 | INFO | utils.misc:108 - Training: Epoch=[11/50] [1300/1759] Batch=1.35 (1.46) Data=0.00 (0.02) Lr=0.000100 Loss=0.6300 (0.4239) IoU=47.06 (59.24) Prec@50=51.79 (64.82)
381
+ 2025-03-03 08:20:26 | INFO | utils.misc:108 - Training: Epoch=[11/50] [1400/1759] Batch=1.32 (1.46) Data=0.01 (0.02) Lr=0.000100 Loss=0.3446 (0.4252) IoU=64.94 (59.19) Prec@50=75.60 (64.73)
382
+ 2025-03-03 08:22:51 | INFO | utils.misc:108 - Training: Epoch=[11/50] [1500/1759] Batch=1.28 (1.46) Data=0.00 (0.02) Lr=0.000100 Loss=0.4085 (0.4247) IoU=60.33 (59.26) Prec@50=66.96 (64.89)
383
+ 2025-03-03 08:25:17 | INFO | utils.misc:108 - Training: Epoch=[11/50] [1600/1759] Batch=1.81 (1.46) Data=0.00 (0.02) Lr=0.000100 Loss=0.2942 (0.4246) IoU=60.72 (59.32) Prec@50=59.59 (64.92)
384
+ 2025-03-03 08:27:44 | INFO | utils.misc:108 - Training: Epoch=[11/50] [1700/1759] Batch=1.20 (1.46) Data=0.00 (0.02) Lr=0.000100 Loss=0.3268 (0.4244) IoU=71.48 (59.39) Prec@50=83.04 (64.99)
385
+ 2025-03-03 08:29:47 | INFO | engine.engine_gref:166 - Evaluation: Epoch=[11/50] mIoU=62.68 oIoU=58.89 Pr@50: 69.95 Pr@60: 62.97 Pr@70: 54.85 Pr@80: 43.25 Pr@90: 20.81
386
+ [2025-03-03 08:31:52,535] torch.distributed.elastic.agent.server.api: [WARNING] Received Signals.SIGINT death signal, shutting down workers
387
+ [2025-03-03 08:31:52,536] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 2017036 closing signal SIGINT
388
+ [2025-03-03 08:31:52,536] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 2017037 closing signal SIGINT
389
+ [2025-03-03 08:31:52,536] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 2017038 closing signal SIGINT
390
+ [2025-03-03 08:31:52,536] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 2017039 closing signal SIGINT
391
+ Exception in thread Thread-24:
392
+ Traceback (most recent call last):
393
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/threading.py", line 980, in _bootstrap_inner
394
+ self.run()
395
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/threading.py", line 917, in run
396
+ self._target(*self._args, **self._kwargs)
397
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/utils/data/_utils/pin_memory.py", line 54, in _pin_memory_loop
398
+ do_one_step()
399
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/utils/data/_utils/pin_memory.py", line 31, in do_one_step
400
+ r = in_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
401
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/multiprocessing/queues.py", line 122, in get
402
+ return _ForkingPickler.loads(res)
403
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/multiprocessing/reductions.py", line 355, in rebuild_storage_fd
404
+ Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fa02edd68b0>
405
+ Traceback (most recent call last):
406
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1478, in __del__
407
+ Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f6b5cf638b0>
408
+ Traceback (most recent call last):
409
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1478, in __del__
410
+ Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f4c191bd8b0>
411
+ Traceback (most recent call last):
412
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1478, in __del__
413
+ [2025-03-03 08:31:52,707] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 2017036 closing signal SIGTERM
414
+ [2025-03-03 08:31:52,707] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 2017037 closing signal SIGTERM
415
+ [2025-03-03 08:31:52,708] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 2017038 closing signal SIGTERM
416
+ [2025-03-03 08:31:52,708] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 2017039 closing signal SIGTERM
417
+ Traceback (most recent call last):
418
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/elastic/agent/server/api.py", line 736, in run
419
+ result = self._invoke_run(role)
420
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/elastic/agent/server/api.py", line 877, in _invoke_run
421
+ time.sleep(monitor_interval)
422
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 62, in _terminate_process_handler
423
+ raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval)
424
+ torch.distributed.elastic.multiprocessing.api.SignalException: Process 2017022 got signal: 2
425
+
426
+ During handling of the above exception, another exception occurred:
427
+
428
+ Traceback (most recent call last):
429
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/elastic/agent/server/api.py", line 743, in run
430
+ self._shutdown(e.sigval)
431
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py", line 289, in _shutdown
432
+ self._pcontext.close(death_sig)
433
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 331, in close
434
+ self._close(death_sig=death_sig, timeout=timeout)
435
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 713, in _close
436
+ handler.proc.wait(time_to_wait)
437
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/subprocess.py", line 1189, in wait
438
+ return self._wait(timeout=timeout)
439
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/subprocess.py", line 1927, in _wait
440
+ time.sleep(delay)
441
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 62, in _terminate_process_handler
442
+ raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval)
443
+ torch.distributed.elastic.multiprocessing.api.SignalException: Process 2017022 got signal: 2
444
+
445
+ During handling of the above exception, another exception occurred:
446
+
447
+ Traceback (most recent call last):
448
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/runpy.py", line 197, in _run_module_as_main
449
+ return _run_code(code, main_globals, None,
450
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/runpy.py", line 87, in _run_code
451
+ exec(code, run_globals)
452
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/launch.py", line 196, in <module>
453
+ main()
454
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/launch.py", line 192, in main
455
+ launch(args)
456
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/launch.py", line 177, in launch
457
+ run(args)
458
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/run.py", line 797, in run
459
+ elastic_launch(
460
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
461
+ return launch_agent(self._config, self._entrypoint, list(args))
462
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 255, in launch_agent
463
+ result = agent.run()
464
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/elastic/metrics/api.py", line 124, in wrapper
465
+ result = f(*args, **kwargs)
466
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/elastic/agent/server/api.py", line 748, in run
467
+ self._shutdown()
468
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py", line 289, in _shutdown
469
+ self._pcontext.close(death_sig)
470
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 331, in close
471
+ self._close(death_sig=death_sig, timeout=timeout)
472
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 713, in _close
473
+ handler.proc.wait(time_to_wait)
474
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/subprocess.py", line 1189, in wait
475
+ return self._wait(timeout=timeout)
476
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/subprocess.py", line 1927, in _wait
477
+ time.sleep(delay)
478
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 62, in _terminate_process_handler
479
+ raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval)
480
+ torch.distributed.elastic.multiprocessing.api.SignalException: Process 2017022 got signal: 2
CGFormer/bash_logs/ACE_filter050_rev.log ADDED
@@ -0,0 +1,528 @@
1
+ /home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/launch.py:181: FutureWarning: The module torch.distributed.launch is deprecated
2
+ and will be removed in future. Use torchrun.
3
+ Note that --use-env is set by default in torchrun.
4
+ If your script expects `--local-rank` argument to be set, please
5
+ change it to read from `os.environ['LOCAL_RANK']` instead. See
6
+ https://pytorch.org/docs/stable/distributed.html#launch-utility for
7
+ further instructions
8
+
9
+ warnings.warn(
10
+ [2025-03-03 16:23:35,171] torch.distributed.run: [WARNING]
11
+ [2025-03-03 16:23:35,171] torch.distributed.run: [WARNING] *****************************************
12
+ [2025-03-03 16:23:35,171] torch.distributed.run: [WARNING] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
13
+ [2025-03-03 16:23:35,171] torch.distributed.run: [WARNING] *****************************************
14
+ /home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/albumentations/__init__.py:24: UserWarning: A new version of Albumentations is available: 2.0.5 (you have 1.4.24). Upgrade using: pip install -U albumentations. To disable automatic update checks, set the environment variable NO_ALBUMENTATIONS_UPDATE to 1.
15
+ check_for_updates()
16
+ /home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/albumentations/__init__.py:24: UserWarning: A new version of Albumentations is available: 2.0.5 (you have 1.4.24). Upgrade using: pip install -U albumentations. To disable automatic update checks, set the environment variable NO_ALBUMENTATIONS_UPDATE to 1.
17
+ check_for_updates()
18
+ /home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/albumentations/__init__.py:24: UserWarning: A new version of Albumentations is available: 2.0.5 (you have 1.4.24). Upgrade using: pip install -U albumentations. To disable automatic update checks, set the environment variable NO_ALBUMENTATIONS_UPDATE to 1.
19
+ check_for_updates()
20
+ /home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/albumentations/__init__.py:24: UserWarning: A new version of Albumentations is available: 2.0.5 (you have 1.4.24). Upgrade using: pip install -U albumentations. To disable automatic update checks, set the environment variable NO_ALBUMENTATIONS_UPDATE to 1.
21
+ check_for_updates()
22
+ /home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/albumentations/__init__.py:24: UserWarning: A new version of Albumentations is available: 2.0.5 (you have 1.4.24). Upgrade using: pip install -U albumentations. To disable automatic update checks, set the environment variable NO_ALBUMENTATIONS_UPDATE to 1.
23
+ check_for_updates()
24
+ /home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/albumentations/__init__.py:24: UserWarning: A new version of Albumentations is available: 2.0.5 (you have 1.4.24). Upgrade using: pip install -U albumentations. To disable automatic update checks, set the environment variable NO_ALBUMENTATIONS_UPDATE to 1.
25
+ check_for_updates()
26
+ /home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/timm/models/layers/__init__.py:48: FutureWarning: Importing from timm.models.layers is deprecated, please import via timm.layers
27
+ warnings.warn(f"Importing from {__name__} is deprecated, please import via timm.layers", FutureWarning)
28
+ /home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/timm/models/layers/__init__.py:48: FutureWarning: Importing from timm.models.layers is deprecated, please import via timm.layers
29
+ warnings.warn(f"Importing from {__name__} is deprecated, please import via timm.layers", FutureWarning)
30
+ /home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/timm/models/layers/__init__.py:48: FutureWarning: Importing from timm.models.layers is deprecated, please import via timm.layers
31
+ warnings.warn(f"Importing from {__name__} is deprecated, please import via timm.layers", FutureWarning)
32
+ /home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/timm/models/layers/__init__.py:48: FutureWarning: Importing from timm.models.layers is deprecated, please import via timm.layers
33
+ warnings.warn(f"Importing from {__name__} is deprecated, please import via timm.layers", FutureWarning)
34
+ /home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/timm/models/layers/__init__.py:48: FutureWarning: Importing from timm.models.layers is deprecated, please import via timm.layers
35
+ warnings.warn(f"Importing from {__name__} is deprecated, please import via timm.layers", FutureWarning)
36
+ /home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/timm/models/layers/__init__.py:48: FutureWarning: Importing from timm.models.layers is deprecated, please import via timm.layers
37
+ warnings.warn(f"Importing from {__name__} is deprecated, please import via timm.layers", FutureWarning)
38
+ 2025-03-03 16:24:00.550 | INFO | __main__:main:66 - LOCAL_RANK from env: 2
39
+ 2025-03-03 16:24:00.550 | INFO | __main__:main:66 - LOCAL_RANK from env: 5
40
+ 2025-03-03 16:24:00.551 | INFO | __main__:main:66 - LOCAL_RANK from env: 4
41
+ 2025-03-03 16:24:00.551 | INFO | __main__:main:66 - LOCAL_RANK from env: 0
42
+ 2025-03-03 16:24:00.551 | INFO | __main__:main:66 - LOCAL_RANK from env: 1
43
+ 2025-03-03 16:24:00.551 | INFO | __main__:main:66 - LOCAL_RANK from env: 3
44
+ 2025-03-03 16:24:00 | INFO | __main__:90 - Starting with GPU: 0, Rank: 0, World Size: 6
45
+ git root error: Cmd('git') failed due to: exit code(128)
46
+ cmdline: git rev-parse --show-toplevel
47
+ stderr: 'fatal: detected dubious ownership in repository at '/data2/projects/chaeyun/CGFormer'
48
+ To add an exception for this directory, call:
49
+
50
+ git config --global --add safe.directory /data2/projects/chaeyun/CGFormer'
51
+ wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
52
+ wandb: Tracking run with wandb version 0.19.1
53
+ wandb: W&B syncing is set to `offline` in this directory.
54
+ wandb: Run `wandb online` or set WANDB_MODE=online to enable cloud syncing.
55
+ node03:2316571:2316571 [0] NCCL INFO Bootstrap : Using eth2:10.1.10.3<0>
56
+ node03:2316571:2316571 [0] NCCL INFO NET/Plugin : Plugin load (libnccl-net.so) returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory
57
+ node03:2316571:2316571 [0] NCCL INFO NET/Plugin : No plugin found, using internal implementation
58
+ node03:2316571:2316571 [0] NCCL INFO cudaDriverVersion 12070
59
+ NCCL version 2.18.5+cuda11.8
60
+ node03:2316574:2316574 [3] NCCL INFO cudaDriverVersion 12070
61
+ node03:2316575:2316575 [4] NCCL INFO cudaDriverVersion 12070
62
+ node03:2316572:2316572 [1] NCCL INFO cudaDriverVersion 12070
63
+ node03:2316576:2316576 [5] NCCL INFO cudaDriverVersion 12070
64
+ node03:2316573:2316573 [2] NCCL INFO cudaDriverVersion 12070
65
+ node03:2316575:2316575 [4] NCCL INFO Bootstrap : Using eth2:10.1.10.3<0>
66
+ node03:2316574:2316574 [3] NCCL INFO Bootstrap : Using eth2:10.1.10.3<0>
67
+ node03:2316576:2316576 [5] NCCL INFO Bootstrap : Using eth2:10.1.10.3<0>
68
+ node03:2316572:2316572 [1] NCCL INFO Bootstrap : Using eth2:10.1.10.3<0>
69
+ node03:2316573:2316573 [2] NCCL INFO Bootstrap : Using eth2:10.1.10.3<0>
70
+ node03:2316575:2316575 [4] NCCL INFO NET/Plugin : Plugin load (libnccl-net.so) returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory
71
+ node03:2316575:2316575 [4] NCCL INFO NET/Plugin : No plugin found, using internal implementation
72
+ node03:2316574:2316574 [3] NCCL INFO NET/Plugin : Plugin load (libnccl-net.so) returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory
73
+ node03:2316574:2316574 [3] NCCL INFO NET/Plugin : No plugin found, using internal implementation
74
+ node03:2316572:2316572 [1] NCCL INFO NET/Plugin : Plugin load (libnccl-net.so) returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory
75
+ node03:2316572:2316572 [1] NCCL INFO NET/Plugin : No plugin found, using internal implementation
76
+ node03:2316576:2316576 [5] NCCL INFO NET/Plugin : Plugin load (libnccl-net.so) returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory
77
+ node03:2316573:2316573 [2] NCCL INFO NET/Plugin : Plugin load (libnccl-net.so) returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory
78
+ node03:2316576:2316576 [5] NCCL INFO NET/Plugin : No plugin found, using internal implementation
79
+ node03:2316573:2316573 [2] NCCL INFO NET/Plugin : No plugin found, using internal implementation
80
+ node03:2316571:2316708 [0] NCCL INFO NET/IB : No device found.
81
+ node03:2316573:2316712 [2] NCCL INFO NET/IB : No device found.
82
+ node03:2316571:2316708 [0] NCCL INFO NET/Socket : Using [0]eth2:10.1.10.3<0>
83
+ node03:2316571:2316708 [0] NCCL INFO Using network Socket
84
+ node03:2316573:2316712 [2] NCCL INFO NET/Socket : Using [0]eth2:10.1.10.3<0>
85
+ node03:2316573:2316712 [2] NCCL INFO Using network Socket
86
+ node03:2316574:2316710 [3] NCCL INFO NET/IB : No device found.
87
+ node03:2316574:2316710 [3] NCCL INFO NET/Socket : Using [0]eth2:10.1.10.3<0>
88
+ node03:2316574:2316710 [3] NCCL INFO Using network Socket
89
+ node03:2316572:2316711 [1] NCCL INFO NET/IB : No device found.
90
+ node03:2316572:2316711 [1] NCCL INFO NET/Socket : Using [0]eth2:10.1.10.3<0>
91
+ node03:2316572:2316711 [1] NCCL INFO Using network Socket
92
+ node03:2316576:2316713 [5] NCCL INFO NET/IB : No device found.
93
+ node03:2316576:2316713 [5] NCCL INFO NET/Socket : Using [0]eth2:10.1.10.3<0>
94
+ node03:2316576:2316713 [5] NCCL INFO Using network Socket
95
+ node03:2316575:2316709 [4] NCCL INFO NET/IB : No device found.
96
+ node03:2316575:2316709 [4] NCCL INFO NET/Socket : Using [0]eth2:10.1.10.3<0>
97
+ node03:2316575:2316709 [4] NCCL INFO Using network Socket
98
+ node03:2316571:2316708 [0] NCCL INFO comm 0xa205d10 rank 0 nranks 6 cudaDev 0 nvmlDev 0 busId 12000 commId 0x5c97a0f7f601b696 - Init START
99
+ node03:2316573:2316712 [2] NCCL INFO comm 0xac1c200 rank 2 nranks 6 cudaDev 2 nvmlDev 2 busId 14000 commId 0x5c97a0f7f601b696 - Init START
100
+ node03:2316576:2316713 [5] NCCL INFO comm 0x9b6ede0 rank 5 nranks 6 cudaDev 5 nvmlDev 5 busId c1000 commId 0x5c97a0f7f601b696 - Init START
101
+ node03:2316572:2316711 [1] NCCL INFO comm 0xa155230 rank 1 nranks 6 cudaDev 1 nvmlDev 1 busId 13000 commId 0x5c97a0f7f601b696 - Init START
102
+ node03:2316575:2316709 [4] NCCL INFO comm 0xa46f6f0 rank 4 nranks 6 cudaDev 4 nvmlDev 4 busId c0000 commId 0x5c97a0f7f601b696 - Init START
103
+ node03:2316574:2316710 [3] NCCL INFO comm 0xadbe390 rank 3 nranks 6 cudaDev 3 nvmlDev 3 busId 48000 commId 0x5c97a0f7f601b696 - Init START
104
+ node03:2316573:2316712 [2] NCCL INFO Setting affinity for GPU 2 to 14005500,00140055
105
+ node03:2316572:2316711 [1] NCCL INFO Setting affinity for GPU 1 to 14005500,00140055
106
+ node03:2316574:2316710 [3] NCCL INFO Setting affinity for GPU 3 to 14005500,00140055
107
+ node03:2316571:2316708 [0] NCCL INFO Setting affinity for GPU 0 to 14005500,00140055
108
+ node03:2316571:2316708 [0] NCCL INFO Channel 00/02 : 0 1 2 3 4 5
109
+ node03:2316575:2316709 [4] NCCL INFO Trees [0] 5/-1/-1->4->3 [1] 5/-1/-1->4->3
110
+ node03:2316572:2316711 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0
111
+ node03:2316574:2316710 [3] NCCL INFO Trees [0] 4/-1/-1->3->2 [1] 4/-1/-1->3->2
112
+ node03:2316573:2316712 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1
113
+ node03:2316571:2316708 [0] NCCL INFO Channel 01/02 : 0 1 2 3 4 5
114
+ node03:2316576:2316713 [5] NCCL INFO Trees [0] -1/-1/-1->5->4 [1] -1/-1/-1->5->4
115
+ node03:2316575:2316709 [4] NCCL INFO P2P Chunksize set to 131072
116
+ node03:2316572:2316711 [1] NCCL INFO P2P Chunksize set to 131072
117
+ node03:2316574:2316710 [3] NCCL INFO P2P Chunksize set to 131072
118
+ node03:2316573:2316712 [2] NCCL INFO P2P Chunksize set to 131072
119
+ node03:2316571:2316708 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1
120
+ node03:2316576:2316713 [5] NCCL INFO P2P Chunksize set to 131072
121
+ node03:2316571:2316708 [0] NCCL INFO P2P Chunksize set to 131072
122
+ node03:2316572:2316711 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[2] via P2P/IPC
123
+ node03:2316572:2316711 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/IPC
124
+ node03:2316576:2316713 [5] NCCL INFO Channel 00 : 5[5] -> 0[0] via SHM/direct/direct
125
+ node03:2316573:2316712 [2] NCCL INFO Channel 00 : 2[2] -> 3[3] via SHM/direct/direct
126
+ node03:2316576:2316713 [5] NCCL INFO Channel 01 : 5[5] -> 0[0] via SHM/direct/direct
127
+ node03:2316573:2316712 [2] NCCL INFO Channel 01 : 2[2] -> 3[3] via SHM/direct/direct
128
+ node03:2316575:2316709 [4] NCCL INFO Channel 00/0 : 4[4] -> 5[5] via P2P/IPC
129
+ node03:2316575:2316709 [4] NCCL INFO Channel 01/0 : 4[4] -> 5[5] via P2P/IPC
130
+ node03:2316571:2316708 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/IPC
131
+ node03:2316574:2316710 [3] NCCL INFO Channel 00 : 3[3] -> 4[4] via SHM/direct/direct
132
+ node03:2316571:2316708 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[1] via P2P/IPC
133
+ node03:2316574:2316710 [3] NCCL INFO Channel 01 : 3[3] -> 4[4] via SHM/direct/direct
134
+ node03:2316575:2316709 [4] NCCL INFO Connected all rings
135
+ node03:2316572:2316711 [1] NCCL INFO Connected all rings
136
+ node03:2316571:2316708 [0] NCCL INFO Connected all rings
137
+ node03:2316573:2316712 [2] NCCL INFO Connected all rings
138
+ node03:2316576:2316713 [5] NCCL INFO Connected all rings
139
+ node03:2316576:2316713 [5] NCCL INFO Channel 00/0 : 5[5] -> 4[4] via P2P/IPC
140
+ node03:2316572:2316711 [1] NCCL INFO Channel 00/0 : 1[1] -> 0[0] via P2P/IPC
141
+ node03:2316575:2316709 [4] NCCL INFO Channel 00 : 4[4] -> 3[3] via SHM/direct/direct
142
+ node03:2316574:2316710 [3] NCCL INFO Connected all rings
143
+ node03:2316572:2316711 [1] NCCL INFO Channel 01/0 : 1[1] -> 0[0] via P2P/IPC
144
+ node03:2316576:2316713 [5] NCCL INFO Channel 01/0 : 5[5] -> 4[4] via P2P/IPC
145
+ node03:2316575:2316709 [4] NCCL INFO Channel 01 : 4[4] -> 3[3] via SHM/direct/direct
146
+ node03:2316571:2316708 [0] NCCL INFO Connected all trees
147
+ node03:2316571:2316708 [0] NCCL INFO threadThresholds 8/8/64 | 48/8/64 | 512 | 512
148
+ node03:2316571:2316708 [0] NCCL INFO 2 coll channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
149
+ node03:2316576:2316713 [5] NCCL INFO Connected all trees
150
+ node03:2316576:2316713 [5] NCCL INFO threadThresholds 8/8/64 | 48/8/64 | 512 | 512
151
+ node03:2316576:2316713 [5] NCCL INFO 2 coll channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
152
+ node03:2316573:2316712 [2] NCCL INFO Channel 00/0 : 2[2] -> 1[1] via P2P/IPC
153
+ node03:2316573:2316712 [2] NCCL INFO Channel 01/0 : 2[2] -> 1[1] via P2P/IPC
154
+ node03:2316572:2316711 [1] NCCL INFO Connected all trees
155
+ node03:2316572:2316711 [1] NCCL INFO threadThresholds 8/8/64 | 48/8/64 | 512 | 512
156
+ node03:2316572:2316711 [1] NCCL INFO 2 coll channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
157
+ node03:2316574:2316710 [3] NCCL INFO Channel 00 : 3[3] -> 2[2] via SHM/direct/direct
158
+ node03:2316574:2316710 [3] NCCL INFO Channel 01 : 3[3] -> 2[2] via SHM/direct/direct
159
+ node03:2316573:2316712 [2] NCCL INFO Connected all trees
160
+ node03:2316573:2316712 [2] NCCL INFO threadThresholds 8/8/64 | 48/8/64 | 512 | 512
161
+ node03:2316573:2316712 [2] NCCL INFO 2 coll channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
162
+ node03:2316575:2316709 [4] NCCL INFO Connected all trees
163
+ node03:2316575:2316709 [4] NCCL INFO threadThresholds 8/8/64 | 48/8/64 | 512 | 512
164
+ node03:2316575:2316709 [4] NCCL INFO 2 coll channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
165
+ node03:2316574:2316710 [3] NCCL INFO Connected all trees
166
+ node03:2316574:2316710 [3] NCCL INFO threadThresholds 8/8/64 | 48/8/64 | 512 | 512
167
+ node03:2316574:2316710 [3] NCCL INFO 2 coll channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer
168
+ node03:2316575:2316709 [4] NCCL INFO comm 0xa46f6f0 rank 4 nranks 6 cudaDev 4 nvmlDev 4 busId c0000 commId 0x5c97a0f7f601b696 - Init COMPLETE
169
+ node03:2316573:2316712 [2] NCCL INFO comm 0xac1c200 rank 2 nranks 6 cudaDev 2 nvmlDev 2 busId 14000 commId 0x5c97a0f7f601b696 - Init COMPLETE
170
+ node03:2316571:2316708 [0] NCCL INFO comm 0xa205d10 rank 0 nranks 6 cudaDev 0 nvmlDev 0 busId 12000 commId 0x5c97a0f7f601b696 - Init COMPLETE
171
+ node03:2316572:2316711 [1] NCCL INFO comm 0xa155230 rank 1 nranks 6 cudaDev 1 nvmlDev 1 busId 13000 commId 0x5c97a0f7f601b696 - Init COMPLETE
172
+ node03:2316574:2316710 [3] NCCL INFO comm 0xadbe390 rank 3 nranks 6 cudaDev 3 nvmlDev 3 busId 48000 commId 0x5c97a0f7f601b696 - Init COMPLETE
173
+ node03:2316576:2316713 [5] NCCL INFO comm 0x9b6ede0 rank 5 nranks 6 cudaDev 5 nvmlDev 5 busId c1000 commId 0x5c97a0f7f601b696 - Init COMPLETE
174
+ 2025-03-03 16:24:03 | INFO | model:31 - Window size 12!
175
+ 2025-03-03 16:24:03 | INFO | model:51 - Initializing Multi-modal Swin Transformer weights from ckpts/swin_base_patch4_window12_384_22k.pth
176
+ 2025-03-03 16:24:05 | INFO | model.backbone:459 - loading swin success !!!
177
+ 2025-03-03 16:24:08 | INFO | __main__:144 - Model moved to GPU: 0
178
+ 2025-03-03 16:24:08 | INFO | __main__:145 - amsgrad: True
179
+ batch_size: 30
180
+ batch_size_val: 16
181
+ bert: bert-base-uncased
182
+ dataset: refcocog_u
183
+ dist_backend: nccl
184
+ dropout: 0.0
185
+ epochs: 50
186
+ evaluate: True
187
+ exclude_multiobj: True
188
+ exp_name: ACE_filter050_rev
189
+ filter_threshold: 0.5
190
+ fusion_drop: 0.0
191
+ gpu: 0
192
+ hp_selection: strict
193
+ input_size: 480
194
+ local_rank: 0
195
+ loss_option: ACE_verbonly
196
+ lr: 0.0001
197
+ lr_backbone: 5e-05
198
+ lr_text_encoder: 5e-05
199
+ manual_seed: 1455390217
200
+ margin_value: 12
201
+ mask_root: data/masks/refcocog_u
202
+ metric_learning: True
203
+ metric_loss_weight: 0.1
204
+ metric_mode: hardpos_only_sbertsim_refined
205
+ mha: 8-8-8-8
206
+ mixup_lasttwo: False
207
+ num_token: 2
208
+ output_dir: exp/refcoco_u/ACE_filter050_rev
209
+ output_folder: exp/refcoco_u
210
+ print_freq: 100
211
+ rank: 0
212
+ resume: None
213
+ save_freq: 1
214
+ start_epoch: 0
215
+ swin_pretrain: ckpts/swin_base_patch4_window12_384_22k.pth
216
+ swin_type: base
217
+ sync_bn: True
218
+ temperature: 0.07
219
+ test_lmdb: data/lmdb/refcocog_u/test.lmdb
220
+ test_split: test
221
+ token_dim: 512
222
+ train_lmdb: data/lmdb/refcocog_u/train.lmdb
223
+ train_split: train
224
+ val_lmdb: data/lmdb/refcocog_u/val.lmdb
225
+ val_split: val
226
+ vis_dim: 512
227
+ visualize: False
228
+ weight: None
229
+ weight_decay: 0.0001
230
+ window12: True
231
+ word_dim: 768
232
+ word_len: 20
233
+ workers: 32
234
+ workers_val: 8
235
+ world_size: 6
236
+ 2025-03-03 16:26:29 | INFO | utils.misc:108 - Training: Epoch=[1/50] [ 100/1407] Batch=1.30 (1.38) Data=0.00 (0.07) Lr=0.000100 Loss=1.0907 (1.1761) IoU=26.22 (19.83) Prec@50=23.61 (8.67)
237
+ 2025-03-03 16:28:41 | INFO | utils.misc:108 - Training: Epoch=[1/50] [ 200/1407] Batch=1.29 (1.35) Data=0.00 (0.05) Lr=0.000100 Loss=0.9969 (1.0899) IoU=29.22 (24.57) Prec@50=25.56 (13.58)
238
+ 2025-03-03 16:30:54 | INFO | utils.misc:108 - Training: Epoch=[1/50] [ 300/1407] Batch=1.36 (1.34) Data=0.00 (0.04) Lr=0.000100 Loss=0.9088 (1.0496) IoU=35.17 (26.91) Prec@50=21.79 (16.03)
239
+ 2025-03-03 16:33:06 | INFO | utils.misc:108 - Training: Epoch=[1/50] [ 400/1407] Batch=1.22 (1.34) Data=0.00 (0.04) Lr=0.000100 Loss=0.8703 (1.0167) IoU=35.01 (28.40) Prec@50=21.29 (17.80)
240
+ 2025-03-03 16:35:19 | INFO | utils.misc:108 - Training: Epoch=[1/50] [ 500/1407] Batch=1.15 (1.34) Data=0.00 (0.04) Lr=0.000100 Loss=0.9754 (0.9922) IoU=31.02 (29.64) Prec@50=17.78 (19.37)
241
+ 2025-03-03 16:37:31 | INFO | utils.misc:108 - Training: Epoch=[1/50] [ 600/1407] Batch=1.16 (1.33) Data=0.00 (0.04) Lr=0.000100 Loss=0.8838 (0.9712) IoU=33.06 (30.63) Prec@50=26.75 (20.87)
242
+ 2025-03-03 16:39:43 | INFO | utils.misc:108 - Training: Epoch=[1/50] [ 700/1407] Batch=1.24 (1.33) Data=0.00 (0.03) Lr=0.000100 Loss=0.7971 (0.9545) IoU=34.82 (31.32) Prec@50=30.06 (21.88)
243
+ 2025-03-03 16:41:53 | INFO | utils.misc:108 - Training: Epoch=[1/50] [ 800/1407] Batch=1.48 (1.33) Data=0.00 (0.03) Lr=0.000100 Loss=0.9076 (0.9414) IoU=35.91 (31.87) Prec@50=23.91 (22.66)
244
+ 2025-03-03 16:44:04 | INFO | utils.misc:108 - Training: Epoch=[1/50] [ 900/1407] Batch=1.29 (1.32) Data=0.00 (0.03) Lr=0.000100 Loss=0.9933 (0.9301) IoU=30.51 (32.29) Prec@50=18.81 (23.24)
245
+ 2025-03-03 16:46:13 | INFO | utils.misc:108 - Training: Epoch=[1/50] [1000/1407] Batch=1.26 (1.32) Data=0.00 (0.03) Lr=0.000100 Loss=0.7584 (0.9199) IoU=40.61 (32.82) Prec@50=30.95 (23.96)
246
+ 2025-03-03 16:48:24 | INFO | utils.misc:108 - Training: Epoch=[1/50] [1100/1407] Batch=1.69 (1.32) Data=0.00 (0.03) Lr=0.000100 Loss=0.8691 (0.9110) IoU=35.25 (33.29) Prec@50=19.97 (24.69)
247
+ 2025-03-03 16:50:37 | INFO | utils.misc:108 - Training: Epoch=[1/50] [1200/1407] Batch=1.39 (1.32) Data=0.00 (0.03) Lr=0.000100 Loss=0.8363 (0.9033) IoU=34.11 (33.67) Prec@50=28.97 (25.27)
248
+ 2025-03-03 16:52:49 | INFO | utils.misc:108 - Training: Epoch=[1/50] [1300/1407] Batch=1.35 (1.32) Data=0.00 (0.03) Lr=0.000100 Loss=0.8169 (0.8959) IoU=34.20 (34.05) Prec@50=26.85 (25.87)
249
+ 2025-03-03 16:55:00 | INFO | utils.misc:108 - Training: Epoch=[1/50] [1400/1407] Batch=1.09 (1.32) Data=0.00 (0.03) Lr=0.000100 Loss=0.7557 (0.8885) IoU=44.41 (34.49) Prec@50=41.27 (26.50)
250
+ 2025-03-03 16:55:45 | INFO | engine.engine_gref:166 - Evaluation: Epoch=[1/50] mIoU=44.49 oIoU=42.87 Pr@50: 40.95 Pr@60: 30.46 Pr@70: 20.32 Pr@80: 11.81 Pr@90: 2.80
251
+ 2025-03-03 16:58:21 | INFO | utils.misc:108 - Training: Epoch=[2/50] [ 100/1407] Batch=1.28 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.9049 (0.7654) IoU=35.26 (41.03) Prec@50=29.66 (36.59)
252
+ 2025-03-03 17:00:37 | INFO | utils.misc:108 - Training: Epoch=[2/50] [ 200/1407] Batch=1.68 (1.33) Data=0.00 (0.03) Lr=0.000100 Loss=0.7241 (0.7458) IoU=40.81 (41.92) Prec@50=42.92 (38.62)
253
+ 2025-03-03 17:02:47 | INFO | utils.misc:108 - Training: Epoch=[2/50] [ 300/1407] Batch=1.42 (1.32) Data=0.00 (0.03) Lr=0.000100 Loss=0.8373 (0.7440) IoU=40.21 (42.04) Prec@50=32.79 (38.73)
254
+ 2025-03-03 17:04:56 | INFO | utils.misc:108 - Training: Epoch=[2/50] [ 400/1407] Batch=1.33 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.8551 (0.7431) IoU=35.71 (41.98) Prec@50=34.74 (38.65)
255
+ 2025-03-03 17:07:05 | INFO | utils.misc:108 - Training: Epoch=[2/50] [ 500/1407] Batch=1.56 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.7449 (0.7434) IoU=44.00 (41.89) Prec@50=46.22 (38.68)
256
+ 2025-03-03 17:09:14 | INFO | utils.misc:108 - Training: Epoch=[2/50] [ 600/1407] Batch=1.31 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.7519 (0.7414) IoU=38.65 (41.92) Prec@50=30.89 (38.80)
257
+ 2025-03-03 17:11:26 | INFO | utils.misc:108 - Training: Epoch=[2/50] [ 700/1407] Batch=1.22 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.6646 (0.7398) IoU=43.75 (42.04) Prec@50=41.87 (39.10)
258
+ 2025-03-03 17:13:36 | INFO | utils.misc:108 - Training: Epoch=[2/50] [ 800/1407] Batch=1.24 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.7340 (0.7378) IoU=42.55 (42.15) Prec@50=39.05 (39.38)
259
+ 2025-03-03 17:15:44 | INFO | utils.misc:108 - Training: Epoch=[2/50] [ 900/1407] Batch=1.77 (1.30) Data=0.00 (0.03) Lr=0.000100 Loss=0.6731 (0.7369) IoU=43.04 (42.09) Prec@50=48.61 (39.34)
260
+ 2025-03-03 17:17:53 | INFO | utils.misc:108 - Training: Epoch=[2/50] [1000/1407] Batch=1.36 (1.30) Data=0.00 (0.03) Lr=0.000100 Loss=0.6417 (0.7343) IoU=45.02 (42.19) Prec@50=51.01 (39.55)
261
+ 2025-03-03 17:20:04 | INFO | utils.misc:108 - Training: Epoch=[2/50] [1100/1407] Batch=1.30 (1.30) Data=0.00 (0.03) Lr=0.000100 Loss=0.8503 (0.7329) IoU=38.33 (42.24) Prec@50=35.36 (39.68)
262
+ 2025-03-03 17:22:15 | INFO | utils.misc:108 - Training: Epoch=[2/50] [1200/1407] Batch=1.62 (1.30) Data=0.00 (0.03) Lr=0.000100 Loss=0.7639 (0.7322) IoU=45.07 (42.29) Prec@50=34.68 (39.71)
263
+ 2025-03-03 17:24:27 | INFO | utils.misc:108 - Training: Epoch=[2/50] [1300/1407] Batch=1.17 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.8485 (0.7314) IoU=36.88 (42.42) Prec@50=38.10 (39.84)
264
+ 2025-03-03 17:26:37 | INFO | utils.misc:108 - Training: Epoch=[2/50] [1400/1407] Batch=1.24 (1.30) Data=0.00 (0.03) Lr=0.000100 Loss=0.8243 (0.7302) IoU=44.98 (42.64) Prec@50=41.79 (40.17)
265
+ 2025-03-03 17:27:21 | INFO | engine.engine_gref:166 - Evaluation: Epoch=[2/50] mIoU=50.35 oIoU=48.08 Pr@50: 51.28 Pr@60: 40.60 Pr@70: 30.92 Pr@80: 19.46 Pr@90: 6.02
266
+ 2025-03-03 17:29:59 | INFO | utils.misc:108 - Training: Epoch=[3/50] [ 100/1407] Batch=1.22 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.6662 (0.6521) IoU=43.34 (48.06) Prec@50=45.54 (48.24)
267
+ 2025-03-03 17:32:11 | INFO | utils.misc:108 - Training: Epoch=[3/50] [ 200/1407] Batch=1.32 (1.32) Data=0.00 (0.03) Lr=0.000100 Loss=0.6270 (0.6581) IoU=50.53 (47.15) Prec@50=58.53 (47.12)
268
+ 2025-03-03 17:34:21 | INFO | utils.misc:108 - Training: Epoch=[3/50] [ 300/1407] Batch=1.13 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.7448 (0.6597) IoU=42.87 (46.97) Prec@50=36.43 (46.95)
269
+ 2025-03-03 17:36:33 | INFO | utils.misc:108 - Training: Epoch=[3/50] [ 400/1407] Batch=1.39 (1.31) Data=0.01 (0.03) Lr=0.000100 Loss=0.6364 (0.6586) IoU=41.82 (46.78) Prec@50=42.46 (46.52)
270
+ 2025-03-03 17:38:45 | INFO | utils.misc:108 - Training: Epoch=[3/50] [ 500/1407] Batch=1.40 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.7957 (0.6597) IoU=43.02 (46.63) Prec@50=32.08 (46.34)
271
+ 2025-03-03 17:40:56 | INFO | utils.misc:108 - Training: Epoch=[3/50] [ 600/1407] Batch=1.14 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.7063 (0.6615) IoU=44.00 (46.61) Prec@50=39.84 (46.29)
272
+ 2025-03-03 17:43:06 | INFO | utils.misc:108 - Training: Epoch=[3/50] [ 700/1407] Batch=1.13 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.5882 (0.6644) IoU=54.05 (46.60) Prec@50=57.14 (46.15)
273
+ 2025-03-03 17:45:16 | INFO | utils.misc:108 - Training: Epoch=[3/50] [ 800/1407] Batch=1.26 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.6320 (0.6651) IoU=48.72 (46.60) Prec@50=47.24 (46.12)
274
+ 2025-03-03 17:47:27 | INFO | utils.misc:108 - Training: Epoch=[3/50] [ 900/1407] Batch=1.23 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.7309 (0.6651) IoU=44.48 (46.71) Prec@50=40.69 (46.31)
275
+ 2025-03-03 17:49:35 | INFO | utils.misc:108 - Training: Epoch=[3/50] [1000/1407] Batch=1.18 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.8106 (0.6643) IoU=33.29 (46.83) Prec@50=28.57 (46.37)
276
+ 2025-03-03 17:51:46 | INFO | utils.misc:108 - Training: Epoch=[3/50] [1100/1407] Batch=1.32 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.6855 (0.6630) IoU=43.40 (46.90) Prec@50=45.09 (46.52)
277
+ 2025-03-03 17:53:58 | INFO | utils.misc:108 - Training: Epoch=[3/50] [1200/1407] Batch=1.25 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.6270 (0.6619) IoU=51.54 (47.00) Prec@50=47.42 (46.63)
278
+ 2025-03-03 17:56:09 | INFO | utils.misc:108 - Training: Epoch=[3/50] [1300/1407] Batch=1.38 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.6188 (0.6614) IoU=48.14 (47.06) Prec@50=53.94 (46.75)
279
+ 2025-03-03 17:58:19 | INFO | utils.misc:108 - Training: Epoch=[3/50] [1400/1407] Batch=1.15 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.6312 (0.6612) IoU=53.97 (47.06) Prec@50=59.68 (46.75)
280
+ 2025-03-03 17:59:03 | INFO | engine.engine_gref:166 - Evaluation: Epoch=[3/50] mIoU=54.39 oIoU=51.28 Pr@50: 57.42 Pr@60: 48.45 Pr@70: 39.20 Pr@80: 27.04 Pr@90: 10.96
281
+ 2025-03-03 18:01:40 | INFO | utils.misc:108 - Training: Epoch=[4/50] [ 100/1407] Batch=1.23 (1.32) Data=0.00 (0.03) Lr=0.000100 Loss=0.7058 (0.6106) IoU=38.22 (48.27) Prec@50=32.74 (49.35)
282
+ 2025-03-03 18:03:51 | INFO | utils.misc:108 - Training: Epoch=[4/50] [ 200/1407] Batch=1.35 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.5542 (0.6165) IoU=43.66 (47.87) Prec@50=45.73 (48.88)
283
+ 2025-03-03 18:06:01 | INFO | utils.misc:108 - Training: Epoch=[4/50] [ 300/1407] Batch=1.51 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.5978 (0.6161) IoU=47.81 (48.07) Prec@50=44.17 (49.12)
284
+ 2025-03-03 18:08:11 | INFO | utils.misc:108 - Training: Epoch=[4/50] [ 400/1407] Batch=1.11 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.7430 (0.6135) IoU=45.09 (48.32) Prec@50=44.84 (49.50)
285
+ 2025-03-03 18:10:22 | INFO | utils.misc:108 - Training: Epoch=[4/50] [ 500/1407] Batch=1.27 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.6319 (0.6166) IoU=49.79 (48.05) Prec@50=47.02 (49.14)
286
+ 2025-03-03 18:12:33 | INFO | utils.misc:108 - Training: Epoch=[4/50] [ 600/1407] Batch=1.59 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.6365 (0.6162) IoU=41.68 (48.00) Prec@50=35.05 (48.94)
287
+ 2025-03-03 18:14:46 | INFO | utils.misc:108 - Training: Epoch=[4/50] [ 700/1407] Batch=1.23 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.5867 (0.6171) IoU=53.39 (47.94) Prec@50=57.92 (48.86)
288
+ 2025-03-03 18:16:55 | INFO | utils.misc:108 - Training: Epoch=[4/50] [ 800/1407] Batch=1.62 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.6259 (0.6161) IoU=45.88 (48.17) Prec@50=48.51 (49.16)
289
+ 2025-03-03 18:19:05 | INFO | utils.misc:108 - Training: Epoch=[4/50] [ 900/1407] Batch=1.24 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.6503 (0.6170) IoU=50.25 (48.31) Prec@50=51.11 (49.28)
290
+ 2025-03-03 18:21:17 | INFO | utils.misc:108 - Training: Epoch=[4/50] [1000/1407] Batch=1.16 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.6409 (0.6158) IoU=48.29 (48.45) Prec@50=50.24 (49.47)
291
+ 2025-03-03 18:23:27 | INFO | utils.misc:108 - Training: Epoch=[4/50] [1100/1407] Batch=1.28 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.6323 (0.6145) IoU=50.15 (48.53) Prec@50=51.49 (49.55)
292
+ 2025-03-03 18:25:37 | INFO | utils.misc:108 - Training: Epoch=[4/50] [1200/1407] Batch=1.22 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.5679 (0.6129) IoU=55.70 (48.67) Prec@50=61.69 (49.72)
293
+ 2025-03-03 18:27:50 | INFO | utils.misc:108 - Training: Epoch=[4/50] [1300/1407] Batch=1.25 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.6499 (0.6116) IoU=50.86 (48.78) Prec@50=50.87 (49.90)
294
+ 2025-03-03 18:30:01 | INFO | utils.misc:108 - Training: Epoch=[4/50] [1400/1407] Batch=1.18 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.7374 (0.6114) IoU=39.56 (48.79) Prec@50=34.68 (49.92)
295
+ 2025-03-03 18:30:45 | INFO | engine.engine_gref:166 - Evaluation: Epoch=[4/50] mIoU=56.35 oIoU=53.31 Pr@50: 60.68 Pr@60: 51.48 Pr@70: 42.50 Pr@80: 30.30 Pr@90: 11.50
296
+ 2025-03-03 18:33:23 | INFO | utils.misc:108 - Training: Epoch=[5/50] [ 100/1407] Batch=1.23 (1.32) Data=0.00 (0.03) Lr=0.000100 Loss=0.4113 (0.5488) IoU=61.30 (51.36) Prec@50=69.78 (53.66)
297
+ 2025-03-03 18:35:32 | INFO | utils.misc:108 - Training: Epoch=[5/50] [ 200/1407] Batch=1.35 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.4562 (0.5582) IoU=59.50 (51.34) Prec@50=66.80 (53.66)
298
+ 2025-03-03 18:37:45 | INFO | utils.misc:108 - Training: Epoch=[5/50] [ 300/1407] Batch=1.43 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.5283 (0.5580) IoU=52.54 (51.87) Prec@50=57.52 (54.45)
299
+ 2025-03-03 18:39:56 | INFO | utils.misc:108 - Training: Epoch=[5/50] [ 400/1407] Batch=1.33 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.5587 (0.5559) IoU=49.20 (52.23) Prec@50=55.06 (54.90)
300
+ 2025-03-03 18:42:08 | INFO | utils.misc:108 - Training: Epoch=[5/50] [ 500/1407] Batch=1.13 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.6211 (0.5567) IoU=50.09 (52.48) Prec@50=54.76 (55.43)
301
+ 2025-03-03 18:44:19 | INFO | utils.misc:108 - Training: Epoch=[5/50] [ 600/1407] Batch=1.14 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.5935 (0.5572) IoU=51.16 (52.40) Prec@50=53.10 (55.33)
302
+ 2025-03-03 18:46:29 | INFO | utils.misc:108 - Training: Epoch=[5/50] [ 700/1407] Batch=1.20 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.6169 (0.5571) IoU=55.58 (52.48) Prec@50=61.11 (55.51)
303
+ 2025-03-03 18:48:40 | INFO | utils.misc:108 - Training: Epoch=[5/50] [ 800/1407] Batch=1.39 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.3962 (0.5566) IoU=63.84 (52.62) Prec@50=73.62 (55.67)
304
+ 2025-03-03 18:50:52 | INFO | utils.misc:108 - Training: Epoch=[5/50] [ 900/1407] Batch=1.23 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.5741 (0.5561) IoU=51.11 (52.62) Prec@50=49.78 (55.68)
305
+ 2025-03-03 18:53:04 | INFO | utils.misc:108 - Training: Epoch=[5/50] [1000/1407] Batch=1.30 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.5752 (0.5549) IoU=46.45 (52.75) Prec@50=45.34 (55.80)
306
+ 2025-03-03 18:55:17 | INFO | utils.misc:108 - Training: Epoch=[5/50] [1100/1407] Batch=1.49 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.5531 (0.5562) IoU=54.64 (52.65) Prec@50=56.15 (55.63)
307
+ 2025-03-03 18:57:30 | INFO | utils.misc:108 - Training: Epoch=[5/50] [1200/1407] Batch=1.23 (1.32) Data=0.00 (0.03) Lr=0.000100 Loss=0.6240 (0.5565) IoU=53.21 (52.64) Prec@50=60.42 (55.60)
308
+ 2025-03-03 18:59:42 | INFO | utils.misc:108 - Training: Epoch=[5/50] [1300/1407] Batch=1.35 (1.32) Data=0.00 (0.03) Lr=0.000100 Loss=0.5657 (0.5568) IoU=57.00 (52.65) Prec@50=65.97 (55.57)
309
+ 2025-03-03 19:01:53 | INFO | utils.misc:108 - Training: Epoch=[5/50] [1400/1407] Batch=1.10 (1.32) Data=0.00 (0.03) Lr=0.000100 Loss=0.4952 (0.5573) IoU=56.29 (52.65) Prec@50=57.14 (55.56)
310
+ 2025-03-03 19:02:37 | INFO | engine.engine_gref:166 - Evaluation: Epoch=[5/50] mIoU=59.29 oIoU=56.00 Pr@50: 65.58 Pr@60: 57.50 Pr@70: 48.68 Pr@80: 36.48 Pr@90: 15.23
311
+ 2025-03-03 19:05:14 | INFO | utils.misc:108 - Training: Epoch=[6/50] [ 100/1407] Batch=1.34 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.4567 (0.4978) IoU=56.26 (55.83) Prec@50=58.94 (60.25)
312
+ 2025-03-03 19:07:25 | INFO | utils.misc:108 - Training: Epoch=[6/50] [ 200/1407] Batch=1.21 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.5726 (0.4995) IoU=47.47 (55.42) Prec@50=48.71 (60.07)
313
+ 2025-03-03 19:09:34 | INFO | utils.misc:108 - Training: Epoch=[6/50] [ 300/1407] Batch=1.32 (1.30) Data=0.00 (0.03) Lr=0.000100 Loss=0.4761 (0.5059) IoU=58.15 (55.06) Prec@50=62.62 (59.25)
314
+ 2025-03-03 19:11:44 | INFO | utils.misc:108 - Training: Epoch=[6/50] [ 400/1407] Batch=1.23 (1.30) Data=0.00 (0.03) Lr=0.000100 Loss=0.5243 (0.5106) IoU=55.15 (54.66) Prec@50=58.63 (58.77)
315
+ 2025-03-03 19:13:54 | INFO | utils.misc:108 - Training: Epoch=[6/50] [ 500/1407] Batch=1.13 (1.30) Data=0.00 (0.03) Lr=0.000100 Loss=0.6256 (0.5109) IoU=46.39 (54.44) Prec@50=48.02 (58.53)
316
+ 2025-03-03 19:16:05 | INFO | utils.misc:108 - Training: Epoch=[6/50] [ 600/1407] Batch=1.45 (1.30) Data=0.00 (0.03) Lr=0.000100 Loss=0.5675 (0.5104) IoU=47.12 (54.51) Prec@50=49.41 (58.52)
317
+ 2025-03-03 19:18:15 | INFO | utils.misc:108 - Training: Epoch=[6/50] [ 700/1407] Batch=1.13 (1.30) Data=0.00 (0.03) Lr=0.000100 Loss=0.6829 (0.5110) IoU=48.30 (54.50) Prec@50=54.37 (58.45)
318
+ 2025-03-03 19:20:25 | INFO | utils.misc:108 - Training: Epoch=[6/50] [ 800/1407] Batch=1.24 (1.30) Data=0.00 (0.03) Lr=0.000100 Loss=0.5415 (0.5114) IoU=49.64 (54.42) Prec@50=50.30 (58.29)
319
+ 2025-03-03 19:22:34 | INFO | utils.misc:108 - Training: Epoch=[6/50] [ 900/1407] Batch=1.28 (1.30) Data=0.00 (0.03) Lr=0.000100 Loss=0.4806 (0.5120) IoU=58.17 (54.46) Prec@50=57.84 (58.30)
320
+ 2025-03-03 19:24:45 | INFO | utils.misc:108 - Training: Epoch=[6/50] [1000/1407] Batch=1.24 (1.30) Data=0.00 (0.03) Lr=0.000100 Loss=0.5476 (0.5130) IoU=46.38 (54.36) Prec@50=48.41 (58.26)
321
+ 2025-03-03 19:26:54 | INFO | utils.misc:108 - Training: Epoch=[6/50] [1100/1407] Batch=1.13 (1.30) Data=0.00 (0.03) Lr=0.000100 Loss=0.4327 (0.5143) IoU=61.53 (54.18) Prec@50=70.24 (58.03)
322
+ 2025-03-03 19:29:04 | INFO | utils.misc:108 - Training: Epoch=[6/50] [1200/1407] Batch=1.27 (1.30) Data=0.00 (0.03) Lr=0.000100 Loss=0.4774 (0.5143) IoU=55.10 (54.15) Prec@50=55.14 (58.05)
323
+ 2025-03-03 19:31:12 | INFO | utils.misc:108 - Training: Epoch=[6/50] [1300/1407] Batch=1.40 (1.30) Data=0.00 (0.03) Lr=0.000100 Loss=0.5230 (0.5145) IoU=51.25 (54.14) Prec@50=52.98 (58.04)
324
+ 2025-03-03 19:33:21 | INFO | utils.misc:108 - Training: Epoch=[6/50] [1400/1407] Batch=1.39 (1.30) Data=0.00 (0.03) Lr=0.000100 Loss=0.6039 (0.5156) IoU=48.91 (54.14) Prec@50=46.36 (57.95)
325
+ 2025-03-03 19:34:05 | INFO | engine.engine_gref:166 - Evaluation: Epoch=[6/50] mIoU=60.83 oIoU=57.66 Pr@50: 67.29 Pr@60: 60.37 Pr@70: 51.44 Pr@80: 39.70 Pr@90: 18.07
326
+ 2025-03-03 19:36:44 | INFO | utils.misc:108 - Training: Epoch=[7/50] [ 100/1407] Batch=1.18 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.3694 (0.4535) IoU=64.46 (57.97) Prec@50=69.13 (63.46)
327
+ 2025-03-03 19:38:55 | INFO | utils.misc:108 - Training: Epoch=[7/50] [ 200/1407] Batch=1.22 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.4216 (0.4611) IoU=56.11 (57.46) Prec@50=58.08 (62.46)
328
+ 2025-03-03 19:41:04 | INFO | utils.misc:108 - Training: Epoch=[7/50] [ 300/1407] Batch=1.18 (1.30) Data=0.00 (0.03) Lr=0.000100 Loss=0.3782 (0.4635) IoU=61.55 (57.46) Prec@50=69.13 (62.34)
329
+ 2025-03-03 19:43:16 | INFO | utils.misc:108 - Training: Epoch=[7/50] [ 400/1407] Batch=1.22 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.4767 (0.4658) IoU=57.43 (57.24) Prec@50=67.86 (61.95)
330
+ 2025-03-03 19:45:27 | INFO | utils.misc:108 - Training: Epoch=[7/50] [ 500/1407] Batch=1.17 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.4054 (0.4673) IoU=62.52 (57.11) Prec@50=69.68 (61.74)
331
+ 2025-03-03 19:47:38 | INFO | utils.misc:108 - Training: Epoch=[7/50] [ 600/1407] Batch=1.23 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.3967 (0.4709) IoU=63.16 (56.86) Prec@50=71.21 (61.39)
332
+ 2025-03-03 19:49:48 | INFO | utils.misc:108 - Training: Epoch=[7/50] [ 700/1407] Batch=1.22 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.4413 (0.4737) IoU=62.03 (56.70) Prec@50=80.79 (61.12)
333
+ 2025-03-03 19:51:58 | INFO | utils.misc:108 - Training: Epoch=[7/50] [ 800/1407] Batch=1.22 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.4080 (0.4749) IoU=62.66 (56.63) Prec@50=68.57 (61.05)
334
+ 2025-03-03 19:54:08 | INFO | utils.misc:108 - Training: Epoch=[7/50] [ 900/1407] Batch=1.15 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.4112 (0.4776) IoU=60.53 (56.50) Prec@50=67.70 (60.89)
335
+ 2025-03-03 19:56:20 | INFO | utils.misc:108 - Training: Epoch=[7/50] [1000/1407] Batch=1.39 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.4987 (0.4784) IoU=61.67 (56.37) Prec@50=65.00 (60.70)
336
+ 2025-03-03 19:58:31 | INFO | utils.misc:108 - Training: Epoch=[7/50] [1100/1407] Batch=1.27 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.3991 (0.4795) IoU=56.89 (56.34) Prec@50=59.13 (60.63)
337
+ 2025-03-03 20:00:42 | INFO | utils.misc:108 - Training: Epoch=[7/50] [1200/1407] Batch=1.59 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.3944 (0.4802) IoU=61.46 (56.35) Prec@50=64.31 (60.58)
338
+ 2025-03-03 20:02:55 | INFO | utils.misc:108 - Training: Epoch=[7/50] [1300/1407] Batch=1.24 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.6520 (0.4818) IoU=52.79 (56.28) Prec@50=57.54 (60.48)
339
+ 2025-03-03 20:05:06 | INFO | utils.misc:108 - Training: Epoch=[7/50] [1400/1407] Batch=1.35 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.5381 (0.4822) IoU=54.06 (56.23) Prec@50=57.18 (60.46)
340
+ 2025-03-03 20:05:51 | INFO | engine.engine_gref:166 - Evaluation: Epoch=[7/50] mIoU=62.05 oIoU=58.56 Pr@50: 69.35 Pr@60: 61.85 Pr@70: 53.96 Pr@80: 40.87 Pr@90: 19.00
341
+ 2025-03-03 20:08:27 | INFO | utils.misc:108 - Training: Epoch=[8/50] [ 100/1407] Batch=1.26 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.3871 (0.4439) IoU=57.42 (57.00) Prec@50=60.81 (62.43)
342
+ 2025-03-03 20:10:38 | INFO | utils.misc:108 - Training: Epoch=[8/50] [ 200/1407] Batch=1.12 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.4502 (0.4448) IoU=59.06 (57.36) Prec@50=60.71 (62.74)
343
+ 2025-03-03 20:12:49 | INFO | utils.misc:108 - Training: Epoch=[8/50] [ 300/1407] Batch=1.13 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.4527 (0.4435) IoU=61.08 (57.48) Prec@50=67.06 (63.02)
344
+ 2025-03-03 20:15:02 | INFO | utils.misc:108 - Training: Epoch=[8/50] [ 400/1407] Batch=1.25 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.4179 (0.4455) IoU=59.71 (57.75) Prec@50=65.75 (63.11)
345
+ 2025-03-03 20:17:13 | INFO | utils.misc:108 - Training: Epoch=[8/50] [ 500/1407] Batch=1.26 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.4548 (0.4472) IoU=59.80 (57.79) Prec@50=66.67 (63.14)
346
+ 2025-03-03 20:19:26 | INFO | utils.misc:108 - Training: Epoch=[8/50] [ 600/1407] Batch=1.25 (1.32) Data=0.00 (0.03) Lr=0.000100 Loss=0.4598 (0.4486) IoU=49.73 (57.75) Prec@50=53.57 (62.92)
347
+ 2025-03-03 20:21:38 | INFO | utils.misc:108 - Training: Epoch=[8/50] [ 700/1407] Batch=1.24 (1.32) Data=0.00 (0.03) Lr=0.000100 Loss=0.4703 (0.4484) IoU=56.49 (57.69) Prec@50=60.24 (62.81)
348
+ 2025-03-03 20:23:48 | INFO | utils.misc:108 - Training: Epoch=[8/50] [ 800/1407] Batch=1.16 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.5171 (0.4514) IoU=57.90 (57.64) Prec@50=64.52 (62.68)
349
+ 2025-03-03 20:25:58 | INFO | utils.misc:108 - Training: Epoch=[8/50] [ 900/1407] Batch=1.22 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.5224 (0.4527) IoU=59.30 (57.70) Prec@50=61.31 (62.74)
350
+ 2025-03-03 20:28:09 | INFO | utils.misc:108 - Training: Epoch=[8/50] [1000/1407] Batch=1.22 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.5577 (0.4544) IoU=52.09 (57.60) Prec@50=56.67 (62.56)
351
+ 2025-03-03 20:30:22 | INFO | utils.misc:108 - Training: Epoch=[8/50] [1100/1407] Batch=1.23 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.4545 (0.4550) IoU=56.95 (57.56) Prec@50=62.40 (62.53)
352
+ 2025-03-03 20:32:33 | INFO | utils.misc:108 - Training: Epoch=[8/50] [1200/1407] Batch=1.23 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.3771 (0.4548) IoU=61.85 (57.58) Prec@50=72.30 (62.52)
353
+ 2025-03-03 20:34:45 | INFO | utils.misc:108 - Training: Epoch=[8/50] [1300/1407] Batch=1.32 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.4313 (0.4551) IoU=60.03 (57.54) Prec@50=65.79 (62.46)
354
+ 2025-03-03 20:36:57 | INFO | utils.misc:108 - Training: Epoch=[8/50] [1400/1407] Batch=1.21 (1.32) Data=0.00 (0.03) Lr=0.000100 Loss=0.5147 (0.4564) IoU=53.92 (57.49) Prec@50=52.18 (62.34)
355
+ 2025-03-03 20:37:41 | INFO | engine.engine_gref:166 - Evaluation: Epoch=[8/50] mIoU=62.41 oIoU=58.68 Pr@50: 69.23 Pr@60: 61.46 Pr@70: 53.65 Pr@80: 41.69 Pr@90: 18.92
356
+ 2025-03-03 20:40:22 | INFO | utils.misc:108 - Training: Epoch=[9/50] [ 100/1407] Batch=1.15 (1.32) Data=0.01 (0.03) Lr=0.000100 Loss=0.3586 (0.4313) IoU=70.73 (59.54) Prec@50=83.97 (64.88)
357
+ 2025-03-03 20:42:34 | INFO | utils.misc:108 - Training: Epoch=[9/50] [ 200/1407] Batch=1.17 (1.32) Data=0.00 (0.03) Lr=0.000100 Loss=0.4021 (0.4262) IoU=58.45 (59.39) Prec@50=69.84 (64.84)
358
+ 2025-03-03 20:44:44 | INFO | utils.misc:108 - Training: Epoch=[9/50] [ 300/1407] Batch=1.21 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.5048 (0.4308) IoU=62.54 (59.46) Prec@50=73.25 (65.16)
359
+ 2025-03-03 20:46:54 | INFO | utils.misc:108 - Training: Epoch=[9/50] [ 400/1407] Batch=1.13 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.5139 (0.4306) IoU=57.54 (59.52) Prec@50=63.33 (65.13)
360
+ 2025-03-03 20:49:05 | INFO | utils.misc:108 - Training: Epoch=[9/50] [ 500/1407] Batch=1.14 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.3949 (0.4328) IoU=64.36 (59.33) Prec@50=70.08 (64.82)
361
+ 2025-03-03 20:51:16 | INFO | utils.misc:108 - Training: Epoch=[9/50] [ 600/1407] Batch=1.26 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.4611 (0.4326) IoU=57.47 (59.35) Prec@50=60.89 (64.74)
362
+ 2025-03-03 20:53:26 | INFO | utils.misc:108 - Training: Epoch=[9/50] [ 700/1407] Batch=1.17 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.4755 (0.4334) IoU=56.52 (59.25) Prec@50=56.35 (64.61)
363
+ 2025-03-03 20:55:37 | INFO | utils.misc:108 - Training: Epoch=[9/50] [ 800/1407] Batch=1.29 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.4796 (0.4337) IoU=60.29 (59.28) Prec@50=68.59 (64.69)
364
+ 2025-03-03 20:57:47 | INFO | utils.misc:108 - Training: Epoch=[9/50] [ 900/1407] Batch=1.24 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.4704 (0.4349) IoU=52.80 (59.28) Prec@50=63.89 (64.72)
365
+ 2025-03-03 20:59:59 | INFO | utils.misc:108 - Training: Epoch=[9/50] [1000/1407] Batch=1.22 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.3763 (0.4347) IoU=65.34 (59.13) Prec@50=72.64 (64.62)
366
+ 2025-03-03 21:02:11 | INFO | utils.misc:108 - Training: Epoch=[9/50] [1100/1407] Batch=1.42 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.4699 (0.4350) IoU=59.93 (59.02) Prec@50=65.93 (64.56)
367
+ 2025-03-03 21:04:23 | INFO | utils.misc:108 - Training: Epoch=[9/50] [1200/1407] Batch=1.21 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.3397 (0.4357) IoU=65.16 (58.84) Prec@50=72.56 (64.37)
368
+ 2025-03-03 21:06:34 | INFO | utils.misc:108 - Training: Epoch=[9/50] [1300/1407] Batch=1.26 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.6262 (0.4363) IoU=48.86 (58.78) Prec@50=47.62 (64.23)
369
+ 2025-03-03 21:08:46 | INFO | utils.misc:108 - Training: Epoch=[9/50] [1400/1407] Batch=1.35 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.4082 (0.4374) IoU=63.09 (58.70) Prec@50=74.50 (64.19)
370
+ 2025-03-03 21:09:29 | INFO | engine.engine_gref:166 - Evaluation: Epoch=[9/50] mIoU=62.25 oIoU=58.06 Pr@50: 69.23 Pr@60: 63.05 Pr@70: 54.78 Pr@80: 42.58 Pr@90: 20.16
371
+ 2025-03-03 21:11:56 | INFO | utils.misc:108 - Training: Epoch=[10/50] [ 100/1407] Batch=1.28 (1.33) Data=0.00 (0.03) Lr=0.000100 Loss=0.3925 (0.4111) IoU=55.59 (59.60) Prec@50=61.05 (66.27)
372
+ 2025-03-03 21:14:06 | INFO | utils.misc:108 - Training: Epoch=[10/50] [ 200/1407] Batch=1.19 (1.32) Data=0.00 (0.03) Lr=0.000100 Loss=0.4349 (0.4082) IoU=58.24 (59.65) Prec@50=63.89 (66.06)
373
+ 2025-03-03 21:16:18 | INFO | utils.misc:108 - Training: Epoch=[10/50] [ 300/1407] Batch=1.72 (1.32) Data=0.00 (0.03) Lr=0.000100 Loss=0.4199 (0.4066) IoU=59.00 (59.75) Prec@50=65.48 (65.97)
374
+ 2025-03-03 21:18:31 | INFO | utils.misc:108 - Training: Epoch=[10/50] [ 400/1407] Batch=1.18 (1.32) Data=0.00 (0.03) Lr=0.000100 Loss=0.3139 (0.4110) IoU=64.88 (59.49) Prec@50=76.98 (65.56)
375
+ 2025-03-03 21:20:42 | INFO | utils.misc:108 - Training: Epoch=[10/50] [ 500/1407] Batch=1.24 (1.32) Data=0.00 (0.03) Lr=0.000100 Loss=0.4145 (0.4128) IoU=58.26 (59.39) Prec@50=63.10 (65.38)
376
+ 2025-03-03 21:22:55 | INFO | utils.misc:108 - Training: Epoch=[10/50] [ 600/1407] Batch=1.47 (1.32) Data=0.00 (0.03) Lr=0.000100 Loss=0.4332 (0.4123) IoU=55.08 (59.26) Prec@50=59.82 (65.29)
377
+ 2025-03-03 21:25:06 | INFO | utils.misc:108 - Training: Epoch=[10/50] [ 700/1407] Batch=1.37 (1.32) Data=0.01 (0.03) Lr=0.000100 Loss=0.5174 (0.4145) IoU=49.25 (59.16) Prec@50=51.03 (65.12)
378
+ 2025-03-03 21:27:17 | INFO | utils.misc:108 - Training: Epoch=[10/50] [ 800/1407] Batch=1.13 (1.32) Data=0.00 (0.03) Lr=0.000100 Loss=0.4140 (0.4154) IoU=55.88 (59.11) Prec@50=62.70 (65.09)
379
+ 2025-03-03 21:29:28 | INFO | utils.misc:108 - Training: Epoch=[10/50] [ 900/1407] Batch=1.36 (1.32) Data=0.00 (0.03) Lr=0.000100 Loss=0.4532 (0.4157) IoU=56.31 (59.08) Prec@50=59.76 (65.11)
380
+ 2025-03-03 21:31:37 | INFO | utils.misc:108 - Training: Epoch=[10/50] [1000/1407] Batch=1.22 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.4510 (0.4167) IoU=54.24 (59.11) Prec@50=59.07 (65.15)
381
+ 2025-03-03 21:33:46 | INFO | utils.misc:108 - Training: Epoch=[10/50] [1100/1407] Batch=1.25 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.4805 (0.4178) IoU=54.15 (59.11) Prec@50=53.57 (65.15)
382
+ 2025-03-03 21:35:55 | INFO | utils.misc:108 - Training: Epoch=[10/50] [1200/1407] Batch=1.17 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.4031 (0.4190) IoU=61.14 (59.09) Prec@50=66.90 (65.10)
383
+ 2025-03-03 21:38:05 | INFO | utils.misc:108 - Training: Epoch=[10/50] [1300/1407] Batch=1.38 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.4519 (0.4200) IoU=53.42 (59.04) Prec@50=58.10 (65.00)
384
+ 2025-03-03 21:40:16 | INFO | utils.misc:108 - Training: Epoch=[10/50] [1400/1407] Batch=1.15 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.4071 (0.4211) IoU=64.93 (59.02) Prec@50=74.13 (64.97)
385
+ 2025-03-03 21:40:58 | INFO | engine.engine_gref:166 - Evaluation: Epoch=[10/50] mIoU=62.92 oIoU=59.93 Pr@50: 70.59 Pr@60: 64.65 Pr@70: 55.67 Pr@80: 43.43 Pr@90: 20.32
386
+ 2025-03-03 21:43:38 | INFO | utils.misc:108 - Training: Epoch=[11/50] [ 100/1407] Batch=1.28 (1.32) Data=0.00 (0.03) Lr=0.000100 Loss=0.4913 (0.3952) IoU=56.25 (61.12) Prec@50=59.94 (67.24)
387
+ 2025-03-03 21:45:49 | INFO | utils.misc:108 - Training: Epoch=[11/50] [ 200/1407] Batch=1.39 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.4227 (0.3953) IoU=61.48 (60.92) Prec@50=66.01 (66.88)
388
+ 2025-03-03 21:47:57 | INFO | utils.misc:108 - Training: Epoch=[11/50] [ 300/1407] Batch=1.24 (1.30) Data=0.00 (0.03) Lr=0.000100 Loss=0.2995 (0.3939) IoU=66.41 (61.32) Prec@50=76.88 (67.40)
389
+ 2025-03-03 21:50:08 | INFO | utils.misc:108 - Training: Epoch=[11/50] [ 400/1407] Batch=1.29 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.3530 (0.3954) IoU=63.13 (61.36) Prec@50=66.90 (67.54)
390
+ 2025-03-03 21:52:18 | INFO | utils.misc:108 - Training: Epoch=[11/50] [ 500/1407] Batch=1.28 (1.30) Data=0.00 (0.03) Lr=0.000100 Loss=0.3185 (0.3975) IoU=61.71 (61.22) Prec@50=66.07 (67.42)
391
+ 2025-03-03 21:54:28 | INFO | utils.misc:108 - Training: Epoch=[11/50] [ 600/1407] Batch=1.22 (1.30) Data=0.00 (0.03) Lr=0.000100 Loss=0.3977 (0.3992) IoU=57.77 (60.99) Prec@50=64.72 (67.13)
392
+ 2025-03-03 21:56:38 | INFO | utils.misc:108 - Training: Epoch=[11/50] [ 700/1407] Batch=1.23 (1.30) Data=0.00 (0.03) Lr=0.000100 Loss=0.4287 (0.3990) IoU=58.03 (60.86) Prec@50=61.11 (67.02)
393
+ 2025-03-03 21:58:50 | INFO | utils.misc:108 - Training: Epoch=[11/50] [ 800/1407] Batch=1.24 (1.30) Data=0.00 (0.03) Lr=0.000100 Loss=0.4175 (0.3999) IoU=62.48 (60.75) Prec@50=69.74 (66.85)
394
+ 2025-03-03 22:01:01 | INFO | utils.misc:108 - Training: Epoch=[11/50] [ 900/1407] Batch=1.27 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.4177 (0.4006) IoU=56.65 (60.83) Prec@50=60.22 (66.99)
395
+ 2025-03-03 22:03:14 | INFO | utils.misc:108 - Training: Epoch=[11/50] [1000/1407] Batch=1.34 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.3733 (0.4012) IoU=64.66 (60.94) Prec@50=75.50 (67.11)
396
+ 2025-03-03 22:05:25 | INFO | utils.misc:108 - Training: Epoch=[11/50] [1100/1407] Batch=1.16 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.3426 (0.4011) IoU=72.35 (61.04) Prec@50=86.75 (67.21)
397
+ 2025-03-03 22:07:36 | INFO | utils.misc:108 - Training: Epoch=[11/50] [1200/1407] Batch=1.22 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.3453 (0.4008) IoU=65.79 (61.17) Prec@50=76.83 (67.42)
398
+ 2025-03-03 22:09:48 | INFO | utils.misc:108 - Training: Epoch=[11/50] [1300/1407] Batch=1.64 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.4473 (0.4011) IoU=59.89 (61.20) Prec@50=73.39 (67.39)
399
+ 2025-03-03 22:12:00 | INFO | utils.misc:108 - Training: Epoch=[11/50] [1400/1407] Batch=1.23 (1.31) Data=0.00 (0.03) Lr=0.000100 Loss=0.4229 (0.4018) IoU=57.42 (61.18) Prec@50=66.96 (67.32)
400
+ 2025-03-03 22:12:46 | INFO | engine.engine_gref:166 - Evaluation: Epoch=[11/50] mIoU=63.56 oIoU=60.39 Pr@50: 71.37 Pr@60: 65.11 Pr@70: 57.58 Pr@80: 44.68 Pr@90: 20.90
401
+ 2025-03-03 22:15:27 | INFO | utils.misc:108 - Training: Epoch=[12/50] [ 100/1407] Batch=1.32 (1.35) Data=0.00 (0.04) Lr=0.000100 Loss=0.3085 (0.3645) IoU=65.95 (62.33) Prec@50=78.01 (69.10)
402
+ 2025-03-03 22:17:39 | INFO | utils.misc:108 - Training: Epoch=[12/50] [ 200/1407] Batch=1.66 (1.33) Data=0.00 (0.03) Lr=0.000100 Loss=0.2717 (0.3668) IoU=68.88 (62.85) Prec@50=78.70 (69.82)
403
+ 2025-03-03 22:19:51 | INFO | utils.misc:108 - Training: Epoch=[12/50] [ 300/1407] Batch=1.14 (1.33) Data=0.00 (0.03) Lr=0.000100 Loss=0.3265 (0.3708) IoU=62.95 (62.78) Prec@50=68.89 (69.53)
404
+ 2025-03-03 22:22:04 | INFO | utils.misc:108 - Training: Epoch=[12/50] [ 400/1407] Batch=1.13 (1.33) Data=0.00 (0.03) Lr=0.000100 Loss=0.3058 (0.3704) IoU=68.49 (62.68) Prec@50=73.65 (69.41)
405
+ 2025-03-03 22:24:17 | INFO | utils.misc:108 - Training: Epoch=[12/50] [ 500/1407] Batch=1.25 (1.33) Data=0.00 (0.03) Lr=0.000100 Loss=0.3924 (0.3704) IoU=69.36 (62.64) Prec@50=85.52 (69.33)
406
+ 2025-03-03 22:26:26 | INFO | utils.misc:108 - Training: Epoch=[12/50] [ 600/1407] Batch=1.16 (1.32) Data=0.00 (0.03) Lr=0.000100 Loss=0.2908 (0.3704) IoU=66.72 (62.64) Prec@50=72.06 (69.23)
407
+ 2025-03-03 22:28:38 | INFO | utils.misc:108 - Training: Epoch=[12/50] [ 700/1407] Batch=1.30 (1.32) Data=0.00 (0.03) Lr=0.000100 Loss=0.4189 (0.3716) IoU=54.04 (62.63) Prec@50=55.06 (69.14)
408
+ 2025-03-03 22:30:49 | INFO | utils.misc:108 - Training: Epoch=[12/50] [ 800/1407] Batch=1.26 (1.32) Data=0.00 (0.03) Lr=0.000100 Loss=0.2783 (0.3722) IoU=71.81 (62.64) Prec@50=80.36 (69.16)
409
+ 2025-03-03 22:32:58 | INFO | utils.misc:108 - Training: Epoch=[12/50] [ 900/1407] Batch=1.23 (1.32) Data=0.00 (0.03) Lr=0.000100 Loss=0.3591 (0.3739) IoU=69.03 (62.56) Prec@50=79.31 (69.07)
410
+ [2025-03-03 22:34:38,633] torch.distributed.elastic.agent.server.api: [WARNING] Received Signals.SIGINT death signal, shutting down workers
411
+ [2025-03-03 22:34:38,634] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 2316571 closing signal SIGINT
412
+ [2025-03-03 22:34:38,636] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 2316572 closing signal SIGINT
413
+ [2025-03-03 22:34:38,637] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 2316573 closing signal SIGINT
414
+ [2025-03-03 22:34:38,637] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 2316574 closing signal SIGINT
415
+ [2025-03-03 22:34:38,637] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 2316575 closing signal SIGINT
416
+ [2025-03-03 22:34:38,637] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 2316576 closing signal SIGINT
417
+ Exception ignored in: [2025-03-03 22:34:38,812] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 2316571 closing signal SIGTERM
418
+ <function _MultiProcessingDataLoaderIter.__del__ at 0x7fcbc8afc8b0>
419
+ Traceback (most recent call last):
420
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1478, in __del__
421
+ Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7fe97c7bf8b0>
422
+ Traceback (most recent call last):
423
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1478, in __del__
424
+ Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f7e575df8b0>
425
+ Traceback (most recent call last):
426
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1478, in __del__
427
+ Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f0f473318b0>
428
+ Traceback (most recent call last):
429
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1478, in __del__
430
+ Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f29013bb8b0>
431
+ Traceback (most recent call last):
432
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1478, in __del__
433
+ [2025-03-03 22:34:38,864] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 2316572 closing signal SIGTERM
434
+ [2025-03-03 22:34:38,864] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 2316573 closing signal SIGTERM
435
+ [2025-03-03 22:34:38,864] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 2316574 closing signal SIGTERM
436
+ [2025-03-03 22:34:38,864] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 2316575 closing signal SIGTERM
437
+ [2025-03-03 22:34:38,864] torch.distributed.elastic.multiprocessing.api: [WARNING] Sending process 2316576 closing signal SIGTERM
438
+ Traceback (most recent call last):
439
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/elastic/agent/server/api.py", line 736, in run
440
+ result = self._invoke_run(role)
441
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/elastic/agent/server/api.py", line 877, in _invoke_run
442
+ time.sleep(monitor_interval)
443
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 62, in _terminate_process_handler
444
+ raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval)
445
+ torch.distributed.elastic.multiprocessing.api.SignalException: Process 2316558 got signal: 2
446
+
447
+ During handling of the above exception, another exception occurred:
448
+
449
+ Traceback (most recent call last):
450
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/elastic/agent/server/api.py", line 743, in run
451
+ self._shutdown(e.sigval)
452
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py", line 289, in _shutdown
453
+ self._pcontext.close(death_sig)
454
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 331, in close
455
+ self._close(death_sig=death_sig, timeout=timeout)
456
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 713, in _close
457
+ handler.proc.wait(time_to_wait)
458
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/subprocess.py", line 1189, in wait
459
+ return self._wait(timeout=timeout)
460
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/subprocess.py", line 1927, in _wait
461
+ time.sleep(delay)
462
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 62, in _terminate_process_handler
463
+ raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval)
464
+ torch.distributed.elastic.multiprocessing.api.SignalException: Process 2316558 got signal: 2
465
+
466
+ During handling of the above exception, another exception occurred:
467
+
468
+ Traceback (most recent call last):
469
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 255, in launch_agent
470
+ result = agent.run()
471
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/elastic/metrics/api.py", line 124, in wrapper
472
+ result = f(*args, **kwargs)
473
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/elastic/agent/server/api.py", line 748, in run
474
+ self._shutdown()
475
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/elastic/agent/server/local_elastic_agent.py", line 289, in _shutdown
476
+ self._pcontext.close(death_sig)
477
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 331, in close
478
+ self._close(death_sig=death_sig, timeout=timeout)
479
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 713, in _close
480
+ handler.proc.wait(time_to_wait)
481
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/subprocess.py", line 1189, in wait
482
+ return self._wait(timeout=timeout)
483
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/subprocess.py", line 1927, in _wait
484
+ time.sleep(delay)
485
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 62, in _terminate_process_handler
486
+ raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval)
487
+ torch.distributed.elastic.multiprocessing.api.SignalException: Process 2316558 got signal: 2
488
+
489
+ During handling of the above exception, another exception occurred:
490
+
491
+ Traceback (most recent call last):
492
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/runpy.py", line 197, in _run_module_as_main
493
+ return _run_code(code, main_globals, None,
494
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/runpy.py", line 87, in _run_code
495
+ exec(code, run_globals)
496
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/launch.py", line 196, in <module>
497
+ main()
498
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/launch.py", line 192, in main
499
+ launch(args)
500
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/launch.py", line 177, in launch
501
+ run(args)
502
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/run.py", line 797, in run
503
+ elastic_launch(
504
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
505
+ return launch_agent(self._config, self._entrypoint, list(args))
506
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 277, in launch_agent
507
+ events.record(agent.get_event_failed())
508
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/elastic/agent/server/api.py", line 756, in get_event_failed
509
+ raw_error=traceback.format_exc(),
510
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/traceback.py", line 167, in format_exc
511
+ return "".join(format_exception(*sys.exc_info(), limit=limit, chain=chain))
512
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/traceback.py", line 120, in format_exception
513
+ return list(TracebackException(
514
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/traceback.py", line 517, in __init__
515
+ self.stack = StackSummary.extract(
516
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/traceback.py", line 366, in extract
517
+ f.line
518
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/traceback.py", line 288, in line
519
+ self._line = linecache.getline(self.filename, self.lineno).strip()
520
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/linecache.py", line 30, in getline
521
+ lines = getlines(filename, module_globals)
522
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/linecache.py", line 46, in getlines
523
+ return updatecache(filename, module_globals)
524
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/linecache.py", line 93, in updatecache
525
+ stat = os.stat(fullname)
526
+ File "/home/seunghoon/.conda/envs/ris_all/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/api.py", line 62, in _terminate_process_handler
527
+ raise SignalException(f"Process {os.getpid()} got signal: {sigval}", sigval=sigval)
528
+ torch.distributed.elastic.multiprocessing.api.SignalException: Process 2316558 got signal: 2
CGFormer/bash_logs/sanity_node03.log ADDED
The diff for this file is too large to render. See raw diff
 
CGFormer/bert/__pycache__/activations.cpython-38.pyc ADDED
Binary file (1.95 kB). View file
 
CGFormer/bert/__pycache__/activations.cpython-39.pyc ADDED
Binary file (1.94 kB). View file
 
CGFormer/bert/__pycache__/configuration_bert.cpython-38.pyc ADDED
Binary file (7.88 kB). View file
 
CGFormer/bert/__pycache__/configuration_bert.cpython-39.pyc ADDED
Binary file (7.88 kB). View file
 
CGFormer/bert/__pycache__/configuration_utils.cpython-38.pyc ADDED
Binary file (16.3 kB). View file
 
CGFormer/bert/__pycache__/configuration_utils.cpython-39.pyc ADDED
Binary file (16.3 kB). View file
 
CGFormer/bert/__pycache__/file_utils.cpython-38.pyc ADDED
Binary file (24.5 kB). View file
 
CGFormer/bert/__pycache__/file_utils.cpython-39.pyc ADDED
Binary file (24.7 kB). View file
 
CGFormer/bert/__pycache__/generation_utils.cpython-38.pyc ADDED
Binary file (28.2 kB). View file
 
CGFormer/bert/__pycache__/generation_utils.cpython-39.pyc ADDED
Binary file (28 kB). View file
 
CGFormer/bert/__pycache__/modeling_bert.cpython-38.pyc ADDED
Binary file (55.3 kB). View file
 
CGFormer/bert/__pycache__/modeling_bert.cpython-39.pyc ADDED
Binary file (55.2 kB). View file
 
CGFormer/bert/__pycache__/modeling_utils.cpython-38.pyc ADDED
Binary file (48 kB). View file
 
CGFormer/bert/__pycache__/modeling_utils.cpython-39.pyc ADDED
Binary file (48 kB). View file
 
CGFormer/bert/__pycache__/tokenization_bert.cpython-38.pyc ADDED
Binary file (19.3 kB). View file
 
CGFormer/bert/__pycache__/tokenization_bert.cpython-39.pyc ADDED
Binary file (19.3 kB). View file
 
CGFormer/bert/__pycache__/tokenization_utils.cpython-38.pyc ADDED
Binary file (24.9 kB). View file
 
CGFormer/bert/__pycache__/tokenization_utils.cpython-39.pyc ADDED
Binary file (24.9 kB). View file
 
CGFormer/bert/__pycache__/tokenization_utils_base.cpython-38.pyc ADDED
Binary file (82.4 kB). View file
 
CGFormer/bert/__pycache__/tokenization_utils_base.cpython-39.pyc ADDED
Binary file (82.4 kB). View file
 
CGFormer/bert/activations.py ADDED
@@ -0,0 +1,56 @@
1
+ import logging
2
+ import math
3
+
4
+ import torch
5
+ import torch.nn.functional as F
6
+
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
+
11
+ def swish(x):
12
+ return x * torch.sigmoid(x)
13
+
14
+
15
+ def _gelu_python(x):
16
+ """ Original Implementation of the gelu activation function in Google Bert repo when initially created.
17
+ For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
18
+ 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
19
+ This is now written in C in torch.nn.functional
20
+ Also see https://arxiv.org/abs/1606.08415
21
+ """
22
+ return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
23
+
24
+
25
+ def gelu_new(x):
26
+ """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
27
+ Also see https://arxiv.org/abs/1606.08415
28
+ """
29
+ return 0.5 * x * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3.0))))
30
+
31
+
32
+ if torch.__version__ < "1.4.0":
33
+ gelu = _gelu_python
34
+ else:
35
+ gelu = F.gelu
36
+
37
+
38
+ def gelu_fast(x):
39
+ return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 * (1.0 + 0.044715 * x * x)))
40
+
41
+
42
+ ACT2FN = {
43
+ "relu": F.relu,
44
+ "swish": swish,
45
+ "gelu": gelu,
46
+ "tanh": torch.tanh,
47
+ "gelu_new": gelu_new,
48
+ "gelu_fast": gelu_fast,
49
+ }
50
+
51
+
52
+ def get_activation(activation_string):
53
+ if activation_string in ACT2FN:
54
+ return ACT2FN[activation_string]
55
+ else:
56
+ raise KeyError("function {} not found in ACT2FN mapping {}".format(activation_string, list(ACT2FN.keys())))
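A minimal usage sketch of the ACT2FN mapping and get_activation helper defined above; the import path is an assumption (in this repo the module lives at CGFormer/bert/activations.py), not something shown in the diff itself.

import torch
from bert.activations import get_activation  # assumed import path for the vendored module above

act_fn = get_activation("gelu")   # any key of ACT2FN ("relu", "swish", "gelu", "tanh", "gelu_new", "gelu_fast") works the same way
x = torch.randn(2, 4)
y = act_fn(x)                     # element-wise activation, same shape as the input
assert y.shape == x.shape
# Unknown names raise KeyError and list the supported keys, e.g. get_activation("prelu")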
CGFormer/bert/configuration_bert.py ADDED
@@ -0,0 +1,143 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """ BERT model configuration """
17
+
18
+
19
+ import logging
20
+
21
+ from .configuration_utils import PretrainedConfig
22
+
23
+
24
+ logger = logging.getLogger(__name__)
25
+
26
+ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
27
+ "bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json",
28
+ "bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json",
29
+ "bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json",
30
+ "bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-config.json",
31
+ "bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-config.json",
32
+ "bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-config.json",
33
+ "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json",
34
+ "bert-base-german-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-cased-config.json",
35
+ "bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-config.json",
36
+ "bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-config.json",
37
+ "bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-config.json",
38
+ "bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-config.json",
39
+ "bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json",
40
+ "bert-base-german-dbmdz-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json",
41
+ "bert-base-german-dbmdz-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json",
42
+ "cl-tohoku/bert-base-japanese": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese/config.json",
43
+ "cl-tohoku/bert-base-japanese-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking/config.json",
44
+ "cl-tohoku/bert-base-japanese-char": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char/config.json",
45
+ "cl-tohoku/bert-base-japanese-char-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking/config.json",
46
+ "TurkuNLP/bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/config.json",
47
+ "TurkuNLP/bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/config.json",
48
+ "wietsedv/bert-base-dutch-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/wietsedv/bert-base-dutch-cased/config.json",
49
+ # See all BERT models at https://huggingface.co/models?filter=bert
50
+ }
51
+
52
+
53
+ class BertConfig(PretrainedConfig):
54
+ r"""
55
+ This is the configuration class to store the configuration of a :class:`~transformers.BertModel`.
56
+ It is used to instantiate a BERT model according to the specified arguments, defining the model
57
+ architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
58
+ the BERT `bert-base-uncased <https://huggingface.co/bert-base-uncased>`__ architecture.
59
+
60
+ Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used
61
+ to control the model outputs. Read the documentation from :class:`~transformers.PretrainedConfig`
62
+ for more information.
63
+
64
+
65
+ Args:
66
+ vocab_size (:obj:`int`, optional, defaults to 30522):
67
+ Vocabulary size of the BERT model. Defines the different tokens that
68
+ can be represented by the `inputs_ids` passed to the forward method of :class:`~transformers.BertModel`.
69
+ hidden_size (:obj:`int`, optional, defaults to 768):
70
+ Dimensionality of the encoder layers and the pooler layer.
71
+ num_hidden_layers (:obj:`int`, optional, defaults to 12):
72
+ Number of hidden layers in the Transformer encoder.
73
+ num_attention_heads (:obj:`int`, optional, defaults to 12):
74
+ Number of attention heads for each attention layer in the Transformer encoder.
75
+ intermediate_size (:obj:`int`, optional, defaults to 3072):
76
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
77
+ hidden_act (:obj:`str` or :obj:`function`, optional, defaults to "gelu"):
78
+ The non-linear activation function (function or string) in the encoder and pooler.
79
+ If string, "gelu", "relu", "swish" and "gelu_new" are supported.
80
+ hidden_dropout_prob (:obj:`float`, optional, defaults to 0.1):
81
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
82
+ attention_probs_dropout_prob (:obj:`float`, optional, defaults to 0.1):
83
+ The dropout ratio for the attention probabilities.
84
+ max_position_embeddings (:obj:`int`, optional, defaults to 512):
85
+ The maximum sequence length that this model might ever be used with.
86
+ Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
87
+ type_vocab_size (:obj:`int`, optional, defaults to 2):
88
+ The vocabulary size of the `token_type_ids` passed into :class:`~transformers.BertModel`.
89
+ initializer_range (:obj:`float`, optional, defaults to 0.02):
90
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
91
+ layer_norm_eps (:obj:`float`, optional, defaults to 1e-12):
92
+ The epsilon used by the layer normalization layers.
93
+ gradient_checkpointing (:obj:`bool`, optional, defaults to False):
94
+ If True, use gradient checkpointing to save memory at the expense of slower backward pass.
95
+
96
+ Example::
97
+
98
+ >>> from transformers import BertModel, BertConfig
99
+
100
+ >>> # Initializing a BERT bert-base-uncased style configuration
101
+ >>> configuration = BertConfig()
102
+
103
+ >>> # Initializing a model from the bert-base-uncased style configuration
104
+ >>> model = BertModel(configuration)
105
+
106
+ >>> # Accessing the model configuration
107
+ >>> configuration = model.config
108
+ """
109
+ model_type = "bert"
110
+
111
+ def __init__(
112
+ self,
113
+ vocab_size=30522,
114
+ hidden_size=768,
115
+ num_hidden_layers=12,
116
+ num_attention_heads=12,
117
+ intermediate_size=3072,
118
+ hidden_act="gelu",
119
+ hidden_dropout_prob=0.1,
120
+ attention_probs_dropout_prob=0.1,
121
+ max_position_embeddings=512,
122
+ type_vocab_size=2,
123
+ initializer_range=0.02,
124
+ layer_norm_eps=1e-12,
125
+ pad_token_id=0,
126
+ gradient_checkpointing=False,
127
+ **kwargs
128
+ ):
129
+ super().__init__(pad_token_id=pad_token_id, **kwargs)
130
+
131
+ self.vocab_size = vocab_size
132
+ self.hidden_size = hidden_size
133
+ self.num_hidden_layers = num_hidden_layers
134
+ self.num_attention_heads = num_attention_heads
135
+ self.hidden_act = hidden_act
136
+ self.intermediate_size = intermediate_size
137
+ self.hidden_dropout_prob = hidden_dropout_prob
138
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
139
+ self.max_position_embeddings = max_position_embeddings
140
+ self.type_vocab_size = type_vocab_size
141
+ self.initializer_range = initializer_range
142
+ self.layer_norm_eps = layer_norm_eps
143
+ self.gradient_checkpointing = gradient_checkpointing
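A minimal sketch of constructing the vendored BertConfig above (import path assumed): with no arguments it reproduces the bert-base-uncased style defaults, and a saved config.json can be loaded through PretrainedConfig.from_json_file, defined in configuration_utils.py below.

from bert.configuration_bert import BertConfig  # assumed import path for the vendored module above

config = BertConfig()  # bert-base-uncased style defaults
assert (config.hidden_size, config.num_hidden_layers, config.num_attention_heads) == (768, 12, 12)

# Loading a previously saved configuration (the path is only illustrative):
# config = BertConfig.from_json_file("./my_model_directory/config.json")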
CGFormer/bert/configuration_utils.py ADDED
@@ -0,0 +1,408 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """ Configuration base class and utilities."""
17
+
18
+
19
+ import copy
20
+ import json
21
+ import logging
22
+ import os
23
+ from typing import Dict, Tuple
24
+
25
+ from .file_utils import CONFIG_NAME, cached_path, hf_bucket_url, is_remote_url
26
+
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
+ class PretrainedConfig(object):
32
+ r""" Base class for all configuration classes.
33
+ Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving configurations.
34
+
35
+ Note:
36
+ A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to initialize a model does **not** load the model weights.
37
+ It only affects the model's configuration.
38
+
39
+ Class attributes (overridden by derived classes):
40
+ - ``model_type``: a string that identifies the model type, that we serialize into the JSON file, and that we use to recreate the correct object in :class:`~transformers.AutoConfig`.
41
+
42
+ Args:
43
+ finetuning_task (:obj:`string` or :obj:`None`, `optional`, defaults to :obj:`None`):
44
+ Name of the task used to fine-tune the model. This can be used when converting from an original (TensorFlow or PyTorch) checkpoint.
45
+ num_labels (:obj:`int`, `optional`, defaults to `2`):
46
+ Number of classes to use when the model is a classification model (sequences/tokens)
47
+ output_hidden_states (:obj:`bool`, `optional`, defaults to :obj:`False`):
48
+ Whether the model should return all hidden-states.
49
+ output_attentions (:obj:`bool`, `optional`, defaults to :obj:`False`):
50
+ Whether the model should return all attentions.
51
+ torchscript (:obj:`bool`, `optional`, defaults to :obj:`False`):
52
+ Is the model used with Torchscript (for PyTorch models).
53
+ """
54
+ model_type: str = ""
55
+
56
+ def __init__(self, **kwargs):
57
+ # Attributes with defaults
58
+ self.output_hidden_states = kwargs.pop("output_hidden_states", False)
59
+ self.output_attentions = kwargs.pop("output_attentions", False)
60
+ self.use_cache = kwargs.pop("use_cache", True) # Not used by all models
61
+ self.torchscript = kwargs.pop("torchscript", False) # Only used by PyTorch models
62
+ self.use_bfloat16 = kwargs.pop("use_bfloat16", False)
63
+ self.pruned_heads = kwargs.pop("pruned_heads", {})
64
+
65
+ # `is_decoder` is used in encoder-decoder models to differentiate the encoder from the decoder
66
+ self.is_encoder_decoder = kwargs.pop("is_encoder_decoder", False)
67
+ self.is_decoder = kwargs.pop("is_decoder", False)
68
+
69
+ # Parameters for sequence generation
70
+ self.max_length = kwargs.pop("max_length", 20)
71
+ self.min_length = kwargs.pop("min_length", 0)
72
+ self.do_sample = kwargs.pop("do_sample", False)
73
+ self.early_stopping = kwargs.pop("early_stopping", False)
74
+ self.num_beams = kwargs.pop("num_beams", 1)
75
+ self.temperature = kwargs.pop("temperature", 1.0)
76
+ self.top_k = kwargs.pop("top_k", 50)
77
+ self.top_p = kwargs.pop("top_p", 1.0)
78
+ self.repetition_penalty = kwargs.pop("repetition_penalty", 1.0)
79
+ self.length_penalty = kwargs.pop("length_penalty", 1.0)
80
+ self.no_repeat_ngram_size = kwargs.pop("no_repeat_ngram_size", 0)
81
+ self.bad_words_ids = kwargs.pop("bad_words_ids", None)
82
+ self.num_return_sequences = kwargs.pop("num_return_sequences", 1)
83
+
84
+ # Fine-tuning task arguments
85
+ self.architectures = kwargs.pop("architectures", None)
86
+ self.finetuning_task = kwargs.pop("finetuning_task", None)
87
+ self.id2label = kwargs.pop("id2label", None)
88
+ self.label2id = kwargs.pop("label2id", None)
89
+ if self.id2label is not None:
90
+ kwargs.pop("num_labels", None)
91
+ self.id2label = dict((int(key), value) for key, value in self.id2label.items())
92
+ # Keys are always strings in JSON so convert ids to int here.
93
+ else:
94
+ self.num_labels = kwargs.pop("num_labels", 2)
95
+
96
+ # Tokenizer arguments TODO: eventually tokenizer and models should share the same config
97
+ self.prefix = kwargs.pop("prefix", None)
98
+ self.bos_token_id = kwargs.pop("bos_token_id", None)
99
+ self.pad_token_id = kwargs.pop("pad_token_id", None)
100
+ self.eos_token_id = kwargs.pop("eos_token_id", None)
101
+ self.decoder_start_token_id = kwargs.pop("decoder_start_token_id", None)
102
+
103
+ # task specific arguments
104
+ self.task_specific_params = kwargs.pop("task_specific_params", None)
105
+
106
+ # TPU arguments
107
+ self.xla_device = kwargs.pop("xla_device", None)
108
+
109
+ # Additional attributes without default values
110
+ for key, value in kwargs.items():
111
+ try:
112
+ setattr(self, key, value)
113
+ except AttributeError as err:
114
+ logger.error("Can't set {} with value {} for {}".format(key, value, self))
115
+ raise err
116
+
117
+ @property
118
+ def num_labels(self):
119
+ return len(self.id2label)
120
+
121
+ @num_labels.setter
122
+ def num_labels(self, num_labels):
123
+ self.id2label = {i: "LABEL_{}".format(i) for i in range(num_labels)}
124
+ self.label2id = dict(zip(self.id2label.values(), self.id2label.keys()))
125
+
126
+ def save_pretrained(self, save_directory):
127
+ """
128
+ Save a configuration object to the directory `save_directory`, so that it
129
+ can be re-loaded using the :func:`~transformers.PretrainedConfig.from_pretrained` class method.
130
+
131
+ Args:
132
+ save_directory (:obj:`string`):
133
+ Directory where the configuration JSON file will be saved.
134
+ """
135
+ if os.path.isfile(save_directory):
136
+ raise AssertionError("Provided path ({}) should be a directory, not a file".format(save_directory))
137
+ os.makedirs(save_directory, exist_ok=True)
138
+ # If we save using the predefined names, we can load using `from_pretrained`
139
+ output_config_file = os.path.join(save_directory, CONFIG_NAME)
140
+
141
+ self.to_json_file(output_config_file, use_diff=True)
142
+ logger.info("Configuration saved in {}".format(output_config_file))
143
+
144
+ @classmethod
145
+ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs) -> "PretrainedConfig":
146
+ r"""
147
+
148
+ Instantiate a :class:`~transformers.PretrainedConfig` (or a derived class) from a pre-trained model configuration.
149
+
150
+ Args:
151
+ pretrained_model_name_or_path (:obj:`string`):
152
+ either:
153
+ - a string with the `shortcut name` of a pre-trained model configuration to load from cache or
154
+ download, e.g.: ``bert-base-uncased``.
155
+ - a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to
156
+ our S3, e.g.: ``dbmdz/bert-base-german-cased``.
157
+ - a path to a `directory` containing a configuration file saved using the
158
+ :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
159
+ - a path or url to a saved configuration JSON `file`, e.g.:
160
+ ``./my_model_directory/configuration.json``.
161
+ cache_dir (:obj:`string`, `optional`):
162
+ Path to a directory in which a downloaded pre-trained model
163
+ configuration should be cached if the standard cache should not be used.
164
+ kwargs (:obj:`Dict[str, any]`, `optional`):
165
+ The values in kwargs of any keys which are configuration attributes will be used to override the loaded
166
+ values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is
167
+ controlled by the `return_unused_kwargs` keyword parameter.
168
+ force_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
169
+ Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
170
+ resume_download (:obj:`bool`, `optional`, defaults to :obj:`False`):
171
+ Do not delete an incompletely received file. Attempt to resume the download if such a file exists.
172
+ proxies (:obj:`Dict`, `optional`):
173
+ A dictionary of proxy servers to use by protocol or endpoint, e.g.:
174
+ :obj:`{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.`
175
+ The proxies are used on each request.
176
+ return_unused_kwargs: (`optional`) bool:
177
+ If False, then this function returns just the final configuration object.
178
+ If True, then this functions returns a :obj:`Tuple(config, unused_kwargs)` where `unused_kwargs` is a
179
+ dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part
180
+ of kwargs which has not been used to update `config` and is otherwise ignored.
181
+
182
+ Returns:
183
+ :class:`PretrainedConfig`: An instance of a configuration object
184
+
185
+ Examples::
186
+
187
+ # We can't instantiate directly the base class `PretrainedConfig` so let's show the examples on a
188
+ # derived class: BertConfig
189
+ config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache.
190
+ config = BertConfig.from_pretrained('./test/saved_model/') # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
191
+ config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json')
192
+ config = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
193
+ assert config.output_attention == True
194
+ config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attention=True,
195
+ foo=False, return_unused_kwargs=True)
196
+ assert config.output_attention == True
197
+ assert unused_kwargs == {'foo': False}
198
+
199
+ """
200
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
201
+ return cls.from_dict(config_dict, **kwargs)
202
+
203
+ @classmethod
204
+ def get_config_dict(cls, pretrained_model_name_or_path: str, **kwargs) -> Tuple[Dict, Dict]:
205
+ """
206
+ From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used
207
+ for instantiating a Config using `from_dict`.
208
+
209
+ Parameters:
210
+ pretrained_model_name_or_path (:obj:`string`):
211
+ The identifier of the pre-trained checkpoint from which we want the dictionary of parameters.
212
+
213
+ Returns:
214
+ :obj:`Tuple[Dict, Dict]`: The dictionary that will be used to instantiate the configuration object.
215
+
216
+ """
217
+ cache_dir = kwargs.pop("cache_dir", None)
218
+ force_download = kwargs.pop("force_download", False)
219
+ resume_download = kwargs.pop("resume_download", False)
220
+ proxies = kwargs.pop("proxies", None)
221
+ local_files_only = kwargs.pop("local_files_only", False)
222
+
223
+ if os.path.isdir(pretrained_model_name_or_path):
224
+ config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
225
+ elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
226
+ config_file = pretrained_model_name_or_path
227
+ else:
228
+ config_file = hf_bucket_url(pretrained_model_name_or_path, filename=CONFIG_NAME, use_cdn=False)
229
+
230
+ try:
231
+ # Load from URL or cache if already cached
232
+ resolved_config_file = cached_path(
233
+ config_file,
234
+ cache_dir=cache_dir,
235
+ force_download=force_download,
236
+ proxies=proxies,
237
+ resume_download=resume_download,
238
+ local_files_only=local_files_only,
239
+ )
240
+ # Load config dict
241
+ if resolved_config_file is None:
242
+ raise EnvironmentError
243
+ config_dict = cls._dict_from_json_file(resolved_config_file)
244
+
245
+ except EnvironmentError:
246
+ msg = (
247
+ f"Can't load config for '{pretrained_model_name_or_path}'. Make sure that:\n\n"
248
+ f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n"
249
+ f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a {CONFIG_NAME} file\n\n"
250
+ )
251
+ raise EnvironmentError(msg)
252
+
253
+ except json.JSONDecodeError:
254
+ msg = (
255
+ "Couldn't reach server at '{}' to download configuration file or "
256
+ "configuration file is not a valid JSON file. "
257
+ "Please check network or file content here: {}.".format(config_file, resolved_config_file)
258
+ )
259
+ raise EnvironmentError(msg)
260
+
261
+ if resolved_config_file == config_file:
262
+ logger.info("loading configuration file {}".format(config_file))
263
+ else:
264
+ logger.info("loading configuration file {} from cache at {}".format(config_file, resolved_config_file))
265
+
266
+ return config_dict, kwargs
267
+
268
+ @classmethod
269
+ def from_dict(cls, config_dict: Dict, **kwargs) -> "PretrainedConfig":
270
+ """
271
+ Constructs a `Config` from a Python dictionary of parameters.
272
+
273
+ Args:
274
+ config_dict (:obj:`Dict[str, any]`):
275
+ Dictionary that will be used to instantiate the configuration object. Such a dictionary can be retrieved
276
+ from a pre-trained checkpoint by leveraging the :func:`~transformers.PretrainedConfig.get_config_dict`
277
+ method.
278
+ kwargs (:obj:`Dict[str, any]`):
279
+ Additional parameters from which to initialize the configuration object.
280
+
281
+ Returns:
282
+ :class:`PretrainedConfig`: An instance of a configuration object
283
+ """
284
+ return_unused_kwargs = kwargs.pop("return_unused_kwargs", False)
285
+
286
+ config = cls(**config_dict)
287
+
288
+ if hasattr(config, "pruned_heads"):
289
+ config.pruned_heads = dict((int(key), value) for key, value in config.pruned_heads.items())
290
+
291
+ # Update config with kwargs if needed
292
+ to_remove = []
293
+ for key, value in kwargs.items():
294
+ if hasattr(config, key):
295
+ setattr(config, key, value)
296
+ to_remove.append(key)
297
+ for key in to_remove:
298
+ kwargs.pop(key, None)
299
+
300
+ logger.info("Model config %s", str(config))
301
+ if return_unused_kwargs:
302
+ return config, kwargs
303
+ else:
304
+ return config
305
+
306
+ @classmethod
307
+ def from_json_file(cls, json_file: str) -> "PretrainedConfig":
308
+ """
309
+ Constructs a `Config` from the path to a json file of parameters.
310
+
311
+ Args:
312
+ json_file (:obj:`string`):
313
+ Path to the JSON file containing the parameters.
314
+
315
+ Returns:
316
+ :class:`PretrainedConfig`: An instance of a configuration object
317
+
318
+ """
319
+ config_dict = cls._dict_from_json_file(json_file)
320
+ return cls(**config_dict)
321
+
322
+ @classmethod
323
+ def _dict_from_json_file(cls, json_file: str):
324
+ with open(json_file, "r", encoding="utf-8") as reader:
325
+ text = reader.read()
326
+ return json.loads(text)
327
+
328
+ def __eq__(self, other):
329
+ return self.__dict__ == other.__dict__
330
+
331
+ def __repr__(self):
332
+ return "{} {}".format(self.__class__.__name__, self.to_json_string())
333
+
334
+ def to_diff_dict(self):
335
+ """
336
+ Removes all attributes from config which correspond to the default
337
+ config attributes for better readability and serializes to a Python
338
+ dictionary.
339
+
340
+ Returns:
341
+ :obj:`Dict[str, any]`: Dictionary of the attributes that differ from the default configuration.
342
+ """
343
+ config_dict = self.to_dict()
344
+
345
+ # get the default config dict
346
+ default_config_dict = PretrainedConfig().to_dict()
347
+
348
+ serializable_config_dict = {}
349
+
350
+ # only serialize values that differ from the default config
351
+ for key, value in config_dict.items():
352
+ if key not in default_config_dict or value != default_config_dict[key]:
353
+ serializable_config_dict[key] = value
354
+
355
+ return serializable_config_dict
356
+
357
+ def to_dict(self):
358
+ """
359
+ Serializes this instance to a Python dictionary.
360
+
361
+ Returns:
362
+ :obj:`Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance.
363
+ """
364
+ output = copy.deepcopy(self.__dict__)
365
+ if hasattr(self.__class__, "model_type"):
366
+ output["model_type"] = self.__class__.model_type
367
+ return output
368
+
369
+ def to_json_string(self, use_diff=True):
370
+ """
371
+ Serializes this instance to a JSON string.
372
+
373
+ Args:
374
+ use_diff (:obj:`bool`):
375
+ If set to True, only the difference between the config instance and the default PretrainedConfig() is serialized to JSON string.
376
+
377
+ Returns:
378
+ :obj:`string`: String containing all the attributes that make up this configuration instance in JSON format.
379
+ """
380
+ if use_diff is True:
381
+ config_dict = self.to_diff_dict()
382
+ else:
383
+ config_dict = self.to_dict()
384
+ return json.dumps(config_dict, indent=2, sort_keys=True) + "\n"
385
+
386
+ def to_json_file(self, json_file_path, use_diff=True):
387
+ """
388
+ Save this instance to a json file.
389
+
390
+ Args:
391
+ json_file_path (:obj:`string`):
392
+ Path to the JSON file in which this configuration instance's parameters will be saved.
393
+ use_diff (:obj:`bool`):
394
+ If set to True, only the difference between the config instance and the default PretrainedConfig() is serialized to JSON file.
395
+ """
396
+ with open(json_file_path, "w", encoding="utf-8") as writer:
397
+ writer.write(self.to_json_string(use_diff=use_diff))
398
+
399
+ def update(self, config_dict: Dict):
400
+ """
401
+ Updates attributes of this class
402
+ with attributes from `config_dict`.
403
+
404
+ Args:
405
+ config_dict (:obj:`Dict[str, any]`): Dictionary of attributes that shall be used to update this class.
406
+ """
407
+ for key, value in config_dict.items():
408
+ setattr(self, key, value)
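A minimal usage sketch for the configuration class above, assuming the vendored `bert` package is importable from the repository root and using `BertConfig` (from the accompanying `configuration_bert.py`) as the concrete subclass; the local directory path is hypothetical and only for illustration:

    from bert.configuration_bert import BertConfig

    # Load a config.json from a local directory (hypothetical path) and override
    # one known attribute; unknown kwargs are returned separately.
    config, unused = BertConfig.from_pretrained(
        './ckpts/bert-base-uncased/',   # hypothetical directory containing config.json
        hidden_dropout_prob=0.2,        # existing attribute, applied to the config
        foo='bar',                      # not a config attribute, left in `unused`
        return_unused_kwargs=True,
    )
    assert config.hidden_dropout_prob == 0.2
    assert unused == {'foo': 'bar'}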
CGFormer/bert/file_utils.py ADDED
@@ -0,0 +1,808 @@
1
+ """
2
+ Utilities for working with the local dataset cache.
3
+ This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp
4
+ Copyright by the AllenNLP authors.
5
+ """
6
+
7
+ import fnmatch
8
+ import json
9
+ import logging
10
+ import os
11
+ import shutil
12
+ import sys
13
+ import tarfile
14
+ import tempfile
15
+ from contextlib import contextmanager
16
+ from functools import partial, wraps
17
+ from hashlib import sha256
18
+ from pathlib import Path
19
+ from typing import Dict, Optional, Union
20
+ from urllib.parse import urlparse
21
+ from zipfile import ZipFile, is_zipfile
22
+
23
+ import requests
24
+ from filelock import FileLock
25
+ from tqdm.auto import tqdm
26
+
27
+ #from . import __version__
28
+ __version__ = "3.0.2"
29
+
30
+ logger = logging.getLogger(__name__) # pylint: disable=invalid-name
31
+
32
+ try:
33
+ USE_TF = os.environ.get("USE_TF", "AUTO").upper()
34
+ USE_TORCH = os.environ.get("USE_TORCH", "AUTO").upper()
35
+ if USE_TORCH in ("1", "ON", "YES", "AUTO") and USE_TF not in ("1", "ON", "YES"):
36
+ import torch
37
+
38
+ _torch_available = True # pylint: disable=invalid-name
39
+ logger.info("PyTorch version {} available.".format(torch.__version__))
40
+ else:
41
+ logger.info("Disabling PyTorch because USE_TF is set")
42
+ _torch_available = False
43
+ except ImportError:
44
+ _torch_available = False # pylint: disable=invalid-name
45
+
46
+ try:
47
+ USE_TF = os.environ.get("USE_TF", "AUTO").upper()
48
+ USE_TORCH = os.environ.get("USE_TORCH", "AUTO").upper()
49
+
50
+ if USE_TF in ("1", "ON", "YES", "AUTO") and USE_TORCH not in ("1", "ON", "YES"):
51
+ import tensorflow as tf
52
+
53
+ assert hasattr(tf, "__version__") and int(tf.__version__[0]) >= 2
54
+ _tf_available = True # pylint: disable=invalid-name
55
+ logger.info("TensorFlow version {} available.".format(tf.__version__))
56
+ else:
57
+ logger.info("Disabling Tensorflow because USE_TORCH is set")
58
+ _tf_available = False
59
+ except (ImportError, AssertionError):
60
+ _tf_available = False # pylint: disable=invalid-name
61
+
62
+
63
+ try:
64
+ from torch.hub import _get_torch_home
65
+
66
+ torch_cache_home = _get_torch_home()
67
+ except ImportError:
68
+ torch_cache_home = os.path.expanduser(
69
+ os.getenv("TORCH_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "torch"))
70
+ )
71
+
72
+
73
+ try:
74
+ import torch_xla.core.xla_model as xm # noqa: F401
75
+
76
+ if _torch_available:
77
+ _torch_tpu_available = True # pylint: disable=invalid-name
78
+ else:
79
+ _torch_tpu_available = False
80
+ except ImportError:
81
+ _torch_tpu_available = False
82
+
83
+
84
+ try:
85
+ import psutil # noqa: F401
86
+
87
+ _psutil_available = True
88
+
89
+ except ImportError:
90
+ _psutil_available = False
91
+
92
+
93
+ try:
94
+ import py3nvml # noqa: F401
95
+
96
+ _py3nvml_available = True
97
+
98
+ except ImportError:
99
+ _py3nvml_available = False
100
+
101
+
102
+ try:
103
+ from apex import amp # noqa: F401
104
+
105
+ _has_apex = True
106
+ except ImportError:
107
+ _has_apex = False
108
+
109
+ default_cache_path = os.path.join(torch_cache_home, "transformers")
110
+
111
+
112
+ PYTORCH_PRETRAINED_BERT_CACHE = os.getenv("PYTORCH_PRETRAINED_BERT_CACHE", default_cache_path)
113
+ PYTORCH_TRANSFORMERS_CACHE = os.getenv("PYTORCH_TRANSFORMERS_CACHE", PYTORCH_PRETRAINED_BERT_CACHE)
114
+ TRANSFORMERS_CACHE = os.getenv("TRANSFORMERS_CACHE", PYTORCH_TRANSFORMERS_CACHE)
115
+
116
+ WEIGHTS_NAME = "pytorch_model.bin"
117
+ TF2_WEIGHTS_NAME = "tf_model.h5"
118
+ TF_WEIGHTS_NAME = "model.ckpt"
119
+ CONFIG_NAME = "config.json"
120
+ MODEL_CARD_NAME = "modelcard.json"
121
+
122
+
123
+ MULTIPLE_CHOICE_DUMMY_INPUTS = [[[0], [1]], [[0], [1]]]
124
+ DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
125
+ DUMMY_MASK = [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0], [0, 0, 0, 1, 1]]
126
+
127
+ S3_BUCKET_PREFIX = "https://s3.amazonaws.com/models.huggingface.co/bert"
128
+ CLOUDFRONT_DISTRIB_PREFIX = "https://cdn.huggingface.co"
129
+
130
+
131
+ def is_torch_available():
132
+ return _torch_available
133
+
134
+
135
+ def is_tf_available():
136
+ return _tf_available
137
+
138
+
139
+ def is_torch_tpu_available():
140
+ return _torch_tpu_available
141
+
142
+
143
+ def is_psutil_available():
144
+ return _psutil_available
145
+
146
+
147
+ def is_py3nvml_available():
148
+ return _py3nvml_available
149
+
150
+
151
+ def is_apex_available():
152
+ return _has_apex
153
+
154
+
155
+ def add_start_docstrings(*docstr):
156
+ def docstring_decorator(fn):
157
+ fn.__doc__ = "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "")
158
+ return fn
159
+
160
+ return docstring_decorator
161
+
162
+
163
+ def add_start_docstrings_to_callable(*docstr):
164
+ def docstring_decorator(fn):
165
+ class_name = ":class:`~transformers.{}`".format(fn.__qualname__.split(".")[0])
166
+ intro = " The {} forward method, overrides the :func:`__call__` special method.".format(class_name)
167
+ note = r"""
168
+
169
+ .. note::
170
+ Although the recipe for forward pass needs to be defined within
171
+ this function, one should call the :class:`Module` instance afterwards
172
+ instead of this since the former takes care of running the
173
+ pre and post processing steps while the latter silently ignores them.
174
+ """
175
+ fn.__doc__ = intro + note + "".join(docstr) + (fn.__doc__ if fn.__doc__ is not None else "")
176
+ return fn
177
+
178
+ return docstring_decorator
179
+
180
+
181
+ def add_end_docstrings(*docstr):
182
+ def docstring_decorator(fn):
183
+ fn.__doc__ = fn.__doc__ + "".join(docstr)
184
+ return fn
185
+
186
+ return docstring_decorator
187
+
188
+
189
+ PT_TOKEN_CLASSIFICATION_SAMPLE = r"""
190
+ Example::
191
+
192
+ >>> from transformers import {tokenizer_class}, {model_class}
193
+ >>> import torch
194
+
195
+ >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
196
+ >>> model = {model_class}.from_pretrained('{checkpoint}')
197
+
198
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
199
+ >>> labels = torch.tensor([1] * inputs["input_ids"].size(1)).unsqueeze(0) # Batch size 1
200
+
201
+ >>> outputs = model(**inputs, labels=labels)
202
+ >>> loss, scores = outputs[:2]
203
+ """
204
+
205
+ PT_QUESTION_ANSWERING_SAMPLE = r"""
206
+ Example::
207
+
208
+ >>> from transformers import {tokenizer_class}, {model_class}
209
+ >>> import torch
210
+
211
+ >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
212
+ >>> model = {model_class}.from_pretrained('{checkpoint}')
213
+
214
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
215
+ >>> start_positions = torch.tensor([1])
216
+ >>> end_positions = torch.tensor([3])
217
+
218
+ >>> outputs = model(**inputs, start_positions=start_positions, end_positions=end_positions)
219
+ >>> loss, start_scores, end_scores = outputs[:3]
220
+ """
221
+
222
+ PT_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
223
+ Example::
224
+
225
+ >>> from transformers import {tokenizer_class}, {model_class}
226
+ >>> import torch
227
+
228
+ >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
229
+ >>> model = {model_class}.from_pretrained('{checkpoint}')
230
+
231
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
232
+ >>> labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
233
+ >>> outputs = model(**inputs, labels=labels)
234
+ >>> loss, logits = outputs[:2]
235
+ """
236
+
237
+ PT_MASKED_LM_SAMPLE = r"""
238
+ Example::
239
+
240
+ >>> from transformers import {tokenizer_class}, {model_class}
241
+ >>> import torch
242
+
243
+ >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
244
+ >>> model = {model_class}.from_pretrained('{checkpoint}')
245
+
246
+ >>> input_ids = tokenizer("Hello, my dog is cute", return_tensors="pt")["input_ids"]
247
+
248
+ >>> outputs = model(input_ids, labels=input_ids)
249
+ >>> loss, prediction_scores = outputs[:2]
250
+ """
251
+
252
+ PT_BASE_MODEL_SAMPLE = r"""
253
+ Example::
254
+
255
+ >>> from transformers import {tokenizer_class}, {model_class}
256
+ >>> import torch
257
+
258
+ >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
259
+ >>> model = {model_class}.from_pretrained('{checkpoint}')
260
+
261
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
262
+ >>> outputs = model(**inputs)
263
+
264
+ >>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
265
+ """
266
+
267
+ PT_MULTIPLE_CHOICE_SAMPLE = r"""
268
+ Example::
269
+
270
+ >>> from transformers import {tokenizer_class}, {model_class}
271
+ >>> import torch
272
+
273
+ >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
274
+ >>> model = {model_class}.from_pretrained('{checkpoint}')
275
+
276
+ >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
277
+ >>> choice0 = "It is eaten with a fork and a knife."
278
+ >>> choice1 = "It is eaten while held in the hand."
279
+ >>> labels = torch.tensor(0).unsqueeze(0) # choice0 is correct (according to Wikipedia ;)), batch size 1
280
+
281
+ >>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='pt', padding=True)
282
+ >>> outputs = model(**{{k: v.unsqueeze(0) for k,v in encoding.items()}}, labels=labels) # batch size is 1
283
+
284
+ >>> # the linear classifier still needs to be trained
285
+ >>> loss, logits = outputs[:2]
286
+ """
287
+
288
+ PT_CAUSAL_LM_SAMPLE = r"""
289
+ Example::
290
+
291
+ >>> import torch
292
+ >>> from transformers import {tokenizer_class}, {model_class}
293
+
294
+ >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
295
+ >>> model = {model_class}.from_pretrained('{checkpoint}')
296
+
297
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
298
+ >>> outputs = model(**inputs, labels=inputs["input_ids"])
299
+ >>> loss, logits = outputs[:2]
300
+ """
301
+
302
+ TF_TOKEN_CLASSIFICATION_SAMPLE = r"""
303
+ Example::
304
+
305
+ >>> from transformers import {tokenizer_class}, {model_class}
306
+ >>> import tensorflow as tf
307
+
308
+ >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
309
+ >>> model = {model_class}.from_pretrained('{checkpoint}')
310
+
311
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
312
+ >>> input_ids = inputs["input_ids"]
313
+ >>> inputs["labels"] = tf.reshape(tf.constant([1] * tf.size(input_ids).numpy()), (-1, tf.size(input_ids))) # Batch size 1
314
+
315
+ >>> outputs = model(inputs)
316
+ >>> loss, scores = outputs[:2]
317
+ """
318
+
319
+ TF_QUESTION_ANSWERING_SAMPLE = r"""
320
+ Example::
321
+
322
+ >>> from transformers import {tokenizer_class}, {model_class}
323
+ >>> import tensorflow as tf
324
+
325
+ >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
326
+ >>> model = {model_class}.from_pretrained('{checkpoint}')
327
+
328
+ >>> question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
329
+ >>> input_dict = tokenizer(question, text, return_tensors='tf')
330
+ >>> start_scores, end_scores = model(input_dict)
331
+
332
+ >>> all_tokens = tokenizer.convert_ids_to_tokens(input_dict["input_ids"].numpy()[0])
333
+ >>> answer = ' '.join(all_tokens[tf.math.argmax(start_scores, 1)[0] : tf.math.argmax(end_scores, 1)[0]+1])
334
+ """
335
+
336
+ TF_SEQUENCE_CLASSIFICATION_SAMPLE = r"""
337
+ Example::
338
+
339
+ >>> from transformers import {tokenizer_class}, {model_class}
340
+ >>> import tensorflow as tf
341
+
342
+ >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
343
+ >>> model = {model_class}.from_pretrained('{checkpoint}')
344
+
345
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
346
+ >>> inputs["labels"] = tf.reshape(tf.constant(1), (-1, 1)) # Batch size 1
347
+
348
+ >>> outputs = model(inputs)
349
+ >>> loss, logits = outputs[:2]
350
+ """
351
+
352
+ TF_MASKED_LM_SAMPLE = r"""
353
+ Example::
354
+ >>> from transformers import {tokenizer_class}, {model_class}
355
+ >>> import tensorflow as tf
356
+
357
+ >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
358
+ >>> model = {model_class}.from_pretrained('{checkpoint}')
359
+
360
+ >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1
361
+
362
+ >>> outputs = model(input_ids)
363
+ >>> prediction_scores = outputs[0]
364
+ """
365
+
366
+ TF_BASE_MODEL_SAMPLE = r"""
367
+ Example::
368
+
369
+ >>> from transformers import {tokenizer_class}, {model_class}
370
+ >>> import tensorflow as tf
371
+
372
+ >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
373
+ >>> model = {model_class}.from_pretrained('{checkpoint}')
374
+
375
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
376
+ >>> outputs = model(inputs)
377
+
378
+ >>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
379
+ """
380
+
381
+ TF_MULTIPLE_CHOICE_SAMPLE = r"""
382
+ Example::
383
+
384
+ >>> from transformers import {tokenizer_class}, {model_class}
385
+ >>> import tensorflow as tf
386
+
387
+ >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
388
+ >>> model = {model_class}.from_pretrained('{checkpoint}')
389
+
390
+ >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
391
+ >>> choice0 = "It is eaten with a fork and a knife."
392
+ >>> choice1 = "It is eaten while held in the hand."
393
+
394
+ >>> encoding = tokenizer([[prompt, prompt], [choice0, choice1]], return_tensors='tf', padding=True)
395
+ >>> inputs = {{k: tf.expand_dims(v, 0) for k, v in encoding.items()}}
396
+ >>> outputs = model(inputs) # batch size is 1
397
+
398
+ >>> # the linear classifier still needs to be trained
399
+ >>> logits = outputs[0]
400
+ """
401
+
402
+ TF_CAUSAL_LM_SAMPLE = r"""
403
+ Example::
404
+
405
+ >>> from transformers import {tokenizer_class}, {model_class}
406
+ >>> import tensorflow as tf
407
+
408
+ >>> tokenizer = {tokenizer_class}.from_pretrained('{checkpoint}')
409
+ >>> model = {model_class}.from_pretrained('{checkpoint}')
410
+
411
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="tf")
412
+ >>> outputs = model(inputs)
413
+ >>> logits = outputs[0]
414
+ """
415
+
416
+
417
+ def add_code_sample_docstrings(*docstr, tokenizer_class=None, checkpoint=None):
418
+ def docstring_decorator(fn):
419
+ model_class = fn.__qualname__.split(".")[0]
420
+ is_tf_class = model_class[:2] == "TF"
421
+
422
+ if "SequenceClassification" in model_class:
423
+ code_sample = TF_SEQUENCE_CLASSIFICATION_SAMPLE if is_tf_class else PT_SEQUENCE_CLASSIFICATION_SAMPLE
424
+ elif "QuestionAnswering" in model_class:
425
+ code_sample = TF_QUESTION_ANSWERING_SAMPLE if is_tf_class else PT_QUESTION_ANSWERING_SAMPLE
426
+ elif "TokenClassification" in model_class:
427
+ code_sample = TF_TOKEN_CLASSIFICATION_SAMPLE if is_tf_class else PT_TOKEN_CLASSIFICATION_SAMPLE
428
+ elif "MultipleChoice" in model_class:
429
+ code_sample = TF_MULTIPLE_CHOICE_SAMPLE if is_tf_class else PT_MULTIPLE_CHOICE_SAMPLE
430
+ elif "MaskedLM" in model_class:
431
+ code_sample = TF_MASKED_LM_SAMPLE if is_tf_class else PT_MASKED_LM_SAMPLE
432
+ elif "LMHead" in model_class:
433
+ code_sample = TF_CAUSAL_LM_SAMPLE if is_tf_class else PT_CAUSAL_LM_SAMPLE
434
+ elif "Model" in model_class:
435
+ code_sample = TF_BASE_MODEL_SAMPLE if is_tf_class else PT_BASE_MODEL_SAMPLE
436
+ else:
437
+ raise ValueError(f"Docstring can't be built for model {model_class}")
438
+
439
+ built_doc = code_sample.format(model_class=model_class, tokenizer_class=tokenizer_class, checkpoint=checkpoint)
440
+ fn.__doc__ = (fn.__doc__ or "") + "".join(docstr) + built_doc
441
+ return fn
442
+
443
+ return docstring_decorator
444
+
445
+
446
+ def is_remote_url(url_or_filename):
447
+ parsed = urlparse(url_or_filename)
448
+ return parsed.scheme in ("http", "https")
449
+
450
+
451
+ def hf_bucket_url(model_id: str, filename: str, use_cdn=True) -> str:
452
+ """
453
+ Resolve a model identifier, and a file name, to a HF-hosted url
454
+ on either S3 or Cloudfront (a Content Delivery Network, or CDN).
455
+
456
+ Cloudfront is replicated over the globe so downloads are way faster
457
+ for the end user (and it also lowers our bandwidth costs). However, it
458
+ is more aggressively cached by default, so may not always reflect the
459
+ latest changes to the underlying file (default TTL is 24 hours).
460
+
461
+ In terms of client-side caching from this library, even though
462
+ Cloudfront relays the ETags from S3, using one or the other
463
+ (or switching from one to the other) will affect caching: cached files
464
+ are not shared between the two because the cached file's name contains
465
+ a hash of the url.
466
+ """
467
+ endpoint = CLOUDFRONT_DISTRIB_PREFIX if use_cdn else S3_BUCKET_PREFIX
468
+ legacy_format = "/" not in model_id
469
+ if legacy_format:
470
+ return f"{endpoint}/{model_id}-{filename}"
471
+ else:
472
+ return f"{endpoint}/{model_id}/{filename}"
473
+
474
+
475
+ def url_to_filename(url, etag=None):
476
+ """
477
+ Convert `url` into a hashed filename in a repeatable way.
478
+ If `etag` is specified, append its hash to the url's, delimited
479
+ by a period.
480
+ If the url ends with .h5 (Keras HDF5 weights), '.h5' is appended to the name
481
+ so that TF 2.0 can identify it as a HDF5 file
482
+ (see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380)
483
+ """
484
+ url_bytes = url.encode("utf-8")
485
+ url_hash = sha256(url_bytes)
486
+ filename = url_hash.hexdigest()
487
+
488
+ if etag:
489
+ etag_bytes = etag.encode("utf-8")
490
+ etag_hash = sha256(etag_bytes)
491
+ filename += "." + etag_hash.hexdigest()
492
+
493
+ if url.endswith(".h5"):
494
+ filename += ".h5"
495
+
496
+ return filename
497
+
498
+
499
+ def filename_to_url(filename, cache_dir=None):
500
+ """
501
+ Return the url and etag (which may be ``None``) stored for `filename`.
502
+ Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.
503
+ """
504
+ if cache_dir is None:
505
+ cache_dir = TRANSFORMERS_CACHE
506
+ if isinstance(cache_dir, Path):
507
+ cache_dir = str(cache_dir)
508
+
509
+ cache_path = os.path.join(cache_dir, filename)
510
+ if not os.path.exists(cache_path):
511
+ raise EnvironmentError("file {} not found".format(cache_path))
512
+
513
+ meta_path = cache_path + ".json"
514
+ if not os.path.exists(meta_path):
515
+ raise EnvironmentError("file {} not found".format(meta_path))
516
+
517
+ with open(meta_path, encoding="utf-8") as meta_file:
518
+ metadata = json.load(meta_file)
519
+ url = metadata["url"]
520
+ etag = metadata["etag"]
521
+
522
+ return url, etag
523
+
524
+
525
+ def cached_path(
526
+ url_or_filename,
527
+ cache_dir=None,
528
+ force_download=False,
529
+ proxies=None,
530
+ resume_download=False,
531
+ user_agent: Union[Dict, str, None] = None,
532
+ extract_compressed_file=False,
533
+ force_extract=False,
534
+ local_files_only=False,
535
+ ) -> Optional[str]:
536
+ """
537
+ Given something that might be a URL (or might be a local path),
538
+ determine which. If it's a URL, download the file and cache it, and
539
+ return the path to the cached file. If it's already a local path,
540
+ make sure the file exists and then return the path.
541
+ Args:
542
+ cache_dir: specify a cache directory to save the file to (overwrite the default cache dir).
543
+ force_download: if True, re-download the file even if it's already cached in the cache dir.
544
+ resume_download: if True, resume the download if an incompletely received file is found.
545
+ user_agent: Optional string or dict that will be appended to the user-agent on remote requests.
546
+ extract_compressed_file: if True and the path points to a zip or tar file, extract the compressed
547
+ file in a folder along the archive.
548
+ force_extract: if True when extract_compressed_file is True and the archive was already extracted,
549
+ re-extract the archive and override the folder where it was extracted.
550
+
551
+ Return:
552
+ None in case of non-recoverable file (non-existent or inaccessible url + no cache on disk).
553
+ Local path (string) otherwise
554
+ """
555
+ if cache_dir is None:
556
+ cache_dir = TRANSFORMERS_CACHE
557
+ if isinstance(url_or_filename, Path):
558
+ url_or_filename = str(url_or_filename)
559
+ if isinstance(cache_dir, Path):
560
+ cache_dir = str(cache_dir)
561
+
562
+ if is_remote_url(url_or_filename):
563
+ # URL, so get it from the cache (downloading if necessary)
564
+ output_path = get_from_cache(
565
+ url_or_filename,
566
+ cache_dir=cache_dir,
567
+ force_download=force_download,
568
+ proxies=proxies,
569
+ resume_download=resume_download,
570
+ user_agent=user_agent,
571
+ local_files_only=local_files_only,
572
+ )
573
+ elif os.path.exists(url_or_filename):
574
+ # File, and it exists.
575
+ output_path = url_or_filename
576
+ elif urlparse(url_or_filename).scheme == "":
577
+ # File, but it doesn't exist.
578
+ raise EnvironmentError("file {} not found".format(url_or_filename))
579
+ else:
580
+ # Something unknown
581
+ raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename))
582
+
583
+ if extract_compressed_file:
584
+ if not is_zipfile(output_path) and not tarfile.is_tarfile(output_path):
585
+ return output_path
586
+
587
+ # Path where we extract compressed archives
588
+ # We avoid '.' in dir name and add "-extracted" at the end: "./model.zip" => "./model-zip-extracted/"
589
+ output_dir, output_file = os.path.split(output_path)
590
+ output_extract_dir_name = output_file.replace(".", "-") + "-extracted"
591
+ output_path_extracted = os.path.join(output_dir, output_extract_dir_name)
592
+
593
+ if os.path.isdir(output_path_extracted) and os.listdir(output_path_extracted) and not force_extract:
594
+ return output_path_extracted
595
+
596
+ # Prevent parallel extractions
597
+ lock_path = output_path + ".lock"
598
+ with FileLock(lock_path):
599
+ shutil.rmtree(output_path_extracted, ignore_errors=True)
600
+ os.makedirs(output_path_extracted)
601
+ if is_zipfile(output_path):
602
+ with ZipFile(output_path, "r") as zip_file:
603
+ zip_file.extractall(output_path_extracted)
604
+ zip_file.close()
605
+ elif tarfile.is_tarfile(output_path):
606
+ tar_file = tarfile.open(output_path)
607
+ tar_file.extractall(output_path_extracted)
608
+ tar_file.close()
609
+ else:
610
+ raise EnvironmentError("Archive format of {} could not be identified".format(output_path))
611
+
612
+ return output_path_extracted
613
+
614
+ return output_path
615
+
616
+
617
+ def http_get(url, temp_file, proxies=None, resume_size=0, user_agent: Union[Dict, str, None] = None):
618
+ ua = "transformers/{}; python/{}".format(__version__, sys.version.split()[0])
619
+ if is_torch_available():
620
+ ua += "; torch/{}".format(torch.__version__)
621
+ if is_tf_available():
622
+ ua += "; tensorflow/{}".format(tf.__version__)
623
+ if isinstance(user_agent, dict):
624
+ ua += "; " + "; ".join("{}/{}".format(k, v) for k, v in user_agent.items())
625
+ elif isinstance(user_agent, str):
626
+ ua += "; " + user_agent
627
+ headers = {"user-agent": ua}
628
+ if resume_size > 0:
629
+ headers["Range"] = "bytes=%d-" % (resume_size,)
630
+ response = requests.get(url, stream=True, proxies=proxies, headers=headers)
631
+ if response.status_code == 416: # Range not satisfiable
632
+ return
633
+ content_length = response.headers.get("Content-Length")
634
+ total = resume_size + int(content_length) if content_length is not None else None
635
+ progress = tqdm(
636
+ unit="B",
637
+ unit_scale=True,
638
+ total=total,
639
+ initial=resume_size,
640
+ desc="Downloading",
641
+ disable=bool(logger.getEffectiveLevel() == logging.NOTSET),
642
+ )
643
+ for chunk in response.iter_content(chunk_size=1024):
644
+ if chunk: # filter out keep-alive new chunks
645
+ progress.update(len(chunk))
646
+ temp_file.write(chunk)
647
+ progress.close()
648
+
649
+
650
+ def get_from_cache(
651
+ url,
652
+ cache_dir=None,
653
+ force_download=False,
654
+ proxies=None,
655
+ etag_timeout=10,
656
+ resume_download=False,
657
+ user_agent: Union[Dict, str, None] = None,
658
+ local_files_only=False,
659
+ ) -> Optional[str]:
660
+ """
661
+ Given a URL, look for the corresponding file in the local cache.
662
+ If it's not there, download it. Then return the path to the cached file.
663
+
664
+ Return:
665
+ None in case of non-recoverable file (non-existent or inaccessible url + no cache on disk).
666
+ Local path (string) otherwise
667
+ """
668
+ if cache_dir is None:
669
+ cache_dir = TRANSFORMERS_CACHE
670
+ if isinstance(cache_dir, Path):
671
+ cache_dir = str(cache_dir)
672
+
673
+ os.makedirs(cache_dir, exist_ok=True)
674
+
675
+ etag = None
676
+ if not local_files_only:
677
+ try:
678
+ response = requests.head(url, allow_redirects=True, proxies=proxies, timeout=etag_timeout)
679
+ if response.status_code == 200:
680
+ etag = response.headers.get("ETag")
681
+ except (EnvironmentError, requests.exceptions.Timeout):
682
+ # etag is already None
683
+ pass
684
+
685
+ filename = url_to_filename(url, etag)
686
+
687
+ # get cache path to put the file
688
+ cache_path = os.path.join(cache_dir, filename)
689
+
690
+ # etag is None = we don't have a connection, or url doesn't exist, or is otherwise inaccessible.
691
+ # try to get the last downloaded one
692
+ if etag is None:
693
+ if os.path.exists(cache_path):
694
+ return cache_path
695
+ else:
696
+ matching_files = [
697
+ file
698
+ for file in fnmatch.filter(os.listdir(cache_dir), filename + ".*")
699
+ if not file.endswith(".json") and not file.endswith(".lock")
700
+ ]
701
+ if len(matching_files) > 0:
702
+ return os.path.join(cache_dir, matching_files[-1])
703
+ else:
704
+ # If files cannot be found and local_files_only=True,
705
+ # the models might've been found if local_files_only=False
706
+ # Notify the user about that
707
+ if local_files_only:
708
+ raise ValueError(
709
+ "Cannot find the requested files in the cached path and outgoing traffic has been"
710
+ " disabled. To enable model look-ups and downloads online, set 'local_files_only'"
711
+ " to False."
712
+ )
713
+ return None
714
+
715
+ # From now on, etag is not None.
716
+ if os.path.exists(cache_path) and not force_download:
717
+ return cache_path
718
+
719
+ # Prevent parallel downloads of the same file with a lock.
720
+ lock_path = cache_path + ".lock"
721
+ with FileLock(lock_path):
722
+
723
+ # If the download just completed while the lock was activated.
724
+ if os.path.exists(cache_path) and not force_download:
725
+ # Even if returning early like here, the lock will be released.
726
+ return cache_path
727
+
728
+ if resume_download:
729
+ incomplete_path = cache_path + ".incomplete"
730
+
731
+ @contextmanager
732
+ def _resumable_file_manager():
733
+ with open(incomplete_path, "a+b") as f:
734
+ yield f
735
+
736
+ temp_file_manager = _resumable_file_manager
737
+ if os.path.exists(incomplete_path):
738
+ resume_size = os.stat(incomplete_path).st_size
739
+ else:
740
+ resume_size = 0
741
+ else:
742
+ temp_file_manager = partial(tempfile.NamedTemporaryFile, dir=cache_dir, delete=False)
743
+ resume_size = 0
744
+
745
+ # Download to temporary file, then copy to cache dir once finished.
746
+ # Otherwise you get corrupt cache entries if the download gets interrupted.
747
+ with temp_file_manager() as temp_file:
748
+ logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name)
749
+
750
+ http_get(url, temp_file, proxies=proxies, resume_size=resume_size, user_agent=user_agent)
751
+
752
+ logger.info("storing %s in cache at %s", url, cache_path)
753
+ os.replace(temp_file.name, cache_path)
754
+
755
+ logger.info("creating metadata file for %s", cache_path)
756
+ meta = {"url": url, "etag": etag}
757
+ meta_path = cache_path + ".json"
758
+ with open(meta_path, "w") as meta_file:
759
+ json.dump(meta, meta_file)
760
+
761
+ return cache_path
762
+
763
+
764
+ class cached_property(property):
765
+ """
766
+ Descriptor that mimics @property but caches output in member variable.
767
+
768
+ From tensorflow_datasets
769
+
770
+ Built-in in functools from Python 3.8.
771
+ """
772
+
773
+ def __get__(self, obj, objtype=None):
774
+ # See docs.python.org/3/howto/descriptor.html#properties
775
+ if obj is None:
776
+ return self
777
+ if self.fget is None:
778
+ raise AttributeError("unreadable attribute")
779
+ attr = "__cached_" + self.fget.__name__
780
+ cached = getattr(obj, attr, None)
781
+ if cached is None:
782
+ cached = self.fget(obj)
783
+ setattr(obj, attr, cached)
784
+ return cached
785
+
786
+
787
+ def torch_required(func):
788
+ # Chose a different decorator name than in tests so it's clear they are not the same.
789
+ @wraps(func)
790
+ def wrapper(*args, **kwargs):
791
+ if is_torch_available():
792
+ return func(*args, **kwargs)
793
+ else:
794
+ raise ImportError(f"Method `{func.__name__}` requires PyTorch.")
795
+
796
+ return wrapper
797
+
798
+
799
+ def tf_required(func):
800
+ # Chose a different decorator name than in tests so it's clear they are not the same.
801
+ @wraps(func)
802
+ def wrapper(*args, **kwargs):
803
+ if is_tf_available():
804
+ return func(*args, **kwargs)
805
+ else:
806
+ raise ImportError(f"Method `{func.__name__}` requires TF.")
807
+
808
+ return wrapper
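A short sketch exercising the caching helpers above, again assuming the vendored module is importable as `bert.file_utils` and that the working directory is the CGFormer folder so the repository's checkpoint file is reachable at the relative path shown (both are assumptions about the runtime environment):

    from bert.file_utils import CONFIG_NAME, cached_path, hf_bucket_url, url_to_filename

    # Build the legacy S3 URL for a model's config.json (a model id without '/'
    # selects the flat "{model_id}-{filename}" layout) and derive its cache key.
    url = hf_bucket_url('bert-base-uncased', filename=CONFIG_NAME, use_cdn=False)
    print(url)                   # https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json
    print(url_to_filename(url))  # sha256 of the URL; an ETag hash is appended when available

    # cached_path passes existing local files straight through.
    print(cached_path('ckpts/swin_base_patch4_window12_384_22k.pth'))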
CGFormer/bert/generation_utils.py ADDED
@@ -0,0 +1,993 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import logging
18
+ from typing import Iterable, Optional, Tuple
19
+
20
+ import torch
21
+ from torch import Tensor
22
+ from torch.nn import functional as F
23
+
24
+
25
+ logger = logging.getLogger(__name__)
26
+
27
+
28
+ class GenerationMixin:
29
+ """
30
+ A class containing all of the functions supporting generation, to be used as a mixin in PreTrainedModel.
31
+ """
32
+
33
+ def prepare_inputs_for_generation(self, input_ids, **kwargs):
34
+ return {"input_ids": input_ids}
35
+
36
+ def adjust_logits_during_generation(self, logits, **kwargs):
37
+ return logits
38
+
39
+ def _use_cache(self, outputs, use_cache):
40
+ """During generation, decide whether to pass the `past` variable to the next forward pass."""
41
+ if len(outputs) <= 1 or use_cache is False:
42
+ return False
43
+ if hasattr(self.config, "mem_len") and self.config.mem_len == 0:
44
+ return False
45
+ return True
46
+
47
+ def enforce_repetition_penalty_(self, lprobs, batch_size, num_beams, prev_output_tokens, repetition_penalty):
48
+ """repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858). """
49
+ for i in range(batch_size * num_beams):
50
+ for previous_token in set(prev_output_tokens[i].tolist()):
51
+ # if score < 0 then repetition penalty has to be multiplied to reduce the previous token probability
52
+ if lprobs[i, previous_token] < 0:
53
+ lprobs[i, previous_token] *= repetition_penalty
54
+ else:
55
+ lprobs[i, previous_token] /= repetition_penalty
56
+
57
+ def postprocess_next_token_scores(
58
+ self,
59
+ scores,
60
+ input_ids,
61
+ no_repeat_ngram_size,
62
+ bad_words_ids,
63
+ cur_len,
64
+ min_length,
65
+ max_length,
66
+ eos_token_id,
67
+ repetition_penalty,
68
+ batch_size,
69
+ num_beams,
70
+ ):
71
+ # repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858)
72
+ if repetition_penalty != 1.0:
73
+ self.enforce_repetition_penalty_(
74
+ scores, batch_size, num_beams, input_ids, repetition_penalty,
75
+ )
76
+
77
+ # set eos token prob to zero if min_length is not reached
78
+ if eos_token_id is not None and cur_len < min_length:
79
+ scores[:, eos_token_id] = -float("inf")
80
+
81
+ if no_repeat_ngram_size > 0:
82
+ # calculate a list of banned tokens to prevent repetitively generating the same ngrams
83
+ num_batch_hypotheses = batch_size * num_beams
84
+ # from fairseq: https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345
85
+ banned_batch_tokens = calc_banned_ngram_tokens(
86
+ input_ids, num_batch_hypotheses, no_repeat_ngram_size, cur_len
87
+ )
88
+ for i, banned_tokens in enumerate(banned_batch_tokens):
89
+ scores[i, banned_tokens] = -float("inf")
90
+
91
+ if bad_words_ids is not None:
92
+ # calculate a list of banned tokens according to bad words
93
+ banned_tokens = calc_banned_bad_words_ids(input_ids, bad_words_ids)
94
+
95
+ for i, banned_tokens in enumerate(banned_tokens):
96
+ scores[i, banned_tokens] = -float("inf")
97
+
98
+ return scores
99
+
100
+ @torch.no_grad()
101
+ def generate(
102
+ self,
103
+ input_ids: Optional[torch.LongTensor] = None,
104
+ max_length: Optional[int] = None,
105
+ min_length: Optional[int] = None,
106
+ do_sample: Optional[bool] = None,
107
+ early_stopping: Optional[bool] = None,
108
+ num_beams: Optional[int] = None,
109
+ temperature: Optional[float] = None,
110
+ top_k: Optional[int] = None,
111
+ top_p: Optional[float] = None,
112
+ repetition_penalty: Optional[float] = None,
113
+ bad_words_ids: Optional[Iterable[int]] = None,
114
+ bos_token_id: Optional[int] = None,
115
+ pad_token_id: Optional[int] = None,
116
+ eos_token_id: Optional[int] = None,
117
+ length_penalty: Optional[float] = None,
118
+ no_repeat_ngram_size: Optional[int] = None,
119
+ num_return_sequences: Optional[int] = None,
120
+ attention_mask: Optional[torch.LongTensor] = None,
121
+ decoder_start_token_id: Optional[int] = None,
122
+ use_cache: Optional[bool] = None,
123
+ **model_specific_kwargs
124
+ ) -> torch.LongTensor:
125
+ r""" Generates sequences for models with a LM head. The method currently supports greedy decoding, beam-search decoding, sampling with temperature, sampling with top-k or nucleus sampling.
126
+
127
+ Adapted in part from `Facebook's XLM beam search code`_.
128
+
129
+ .. _`Facebook's XLM beam search code`:
130
+ https://github.com/facebookresearch/XLM/blob/9e6f6814d17be4fe5b15f2e6c43eb2b2d76daeb4/src/model/transformer.py#L529
131
+
132
+
133
+ Parameters:
134
+
135
+ input_ids: (`optional`) `torch.LongTensor` of shape `(batch_size, sequence_length)`
136
+ The sequence used as a prompt for the generation. If `None` the method initializes
137
+ it as an empty `torch.LongTensor` of shape `(1,)`.
138
+
139
+ max_length: (`optional`) int
140
+ The max length of the sequence to be generated. Between `min_length` and infinity. Default to 20.
141
+
142
+ min_length: (`optional`) int
143
+ The min length of the sequence to be generated. Between 0 and infinity. Default to 0.
144
+
145
+ do_sample: (`optional`) bool
146
+ If set to `False` greedy decoding is used. Otherwise sampling is used. Defaults to `False` as defined in `configuration_utils.PretrainedConfig`.
147
+
148
+ early_stopping: (`optional`) bool
149
+ if set to `True` beam search is stopped when at least `num_beams` sentences finished per batch. Defaults to `False` as defined in `configuration_utils.PretrainedConfig`.
150
+
151
+ num_beams: (`optional`) int
152
+ Number of beams for beam search. Must be between 1 and infinity. 1 means no beam search. Default to 1.
153
+
154
+ temperature: (`optional`) float
155
+ The value used to modulate the next token probabilities. Must be strictly positive. Default to 1.0.
156
+
157
+ top_k: (`optional`) int
158
+ The number of highest probability vocabulary tokens to keep for top-k-filtering. Between 1 and infinity. Default to 50.
159
+
160
+ top_p: (`optional`) float
161
+ The cumulative probability threshold for nucleus sampling: only the smallest set of most probable vocabulary tokens whose cumulative probability reaches `top_p` is kept. Must be between 0 and 1. Default to 1.
162
+
163
+ repetition_penalty: (`optional`) float
164
+ The parameter for repetition penalty. Between 1.0 and infinity. 1.0 means no penalty. Default to 1.0.
165
+
166
+ pad_token_id: (`optional`) int
167
+ Padding token. Defaults to the model-specific pad_token_id, or None if it does not exist.
168
+
169
+ bos_token_id: (`optional`) int
170
+ BOS token. Defaults to `bos_token_id` as defined in the models config.
171
+
172
+ eos_token_id: (`optional`) int
173
+ EOS token. Defaults to `eos_token_id` as defined in the models config.
174
+
175
+ length_penalty: (`optional`) float
176
+ Exponential penalty to the length. Default to 1.
177
+
178
+ no_repeat_ngram_size: (`optional`) int
179
+ If set to int > 0, all ngrams of size `no_repeat_ngram_size` can only occur once.
180
+ bad_words_ids: (`optional`) list of lists of int
181
+ `bad_words_ids` contains tokens that are not allowed to be generated. In order to get the tokens of the words that should not appear in the generated text, use `tokenizer.encode(bad_word, add_prefix_space=True)`.
182
+
183
+ num_return_sequences: (`optional`) int
184
+ The number of independently computed returned sequences for each element in the batch. Default to 1.
185
+
186
+ attention_mask (`optional`) obj: `torch.LongTensor` of same shape as `input_ids`
187
+ Mask to avoid performing attention on padding token indices.
188
+ Mask values selected in ``[0, 1]``:
189
+ ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
190
+ Defaults to `None`.
191
+
192
+ `What are attention masks? <../glossary.html#attention-mask>`__
193
+
194
+ decoder_start_token_id=None: (`optional`) int
195
+ If an encoder-decoder model starts decoding with a different token than BOS.
196
+ Defaults to `None` and is changed to `BOS` later.
197
+
198
+ use_cache: (`optional`) bool
199
+ If `use_cache` is True, past key values are used to speed up decoding if applicable to model. Defaults to `True`.
200
+
201
+ model_specific_kwargs: (`optional`) dict
202
+ Additional model specific kwargs will be forwarded to the `forward` function of the model.
203
+
204
+ Return:
205
+
206
+ output: `torch.LongTensor` of shape `(batch_size * num_return_sequences, sequence_length)`
207
+ sequence_length is either equal to max_length or shorter if all batches finished early due to the `eos_token_id`
208
+
209
+ Examples::
210
+
211
+ tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer
212
+ model = AutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache.
213
+ outputs = model.generate(max_length=40) # do greedy decoding
214
+ print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True)))
215
+
216
+ tokenizer = AutoTokenizer.from_pretrained('openai-gpt') # Initialize tokenizer
217
+ model = AutoModelWithLMHead.from_pretrained('openai-gpt') # Download model and configuration from S3 and cache.
218
+ input_context = 'The dog'
219
+ input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context
220
+ outputs = model.generate(input_ids=input_ids, num_beams=5, num_return_sequences=3, temperature=1.5) # generate 3 independent sequences using beam search decoding (5 beams) with sampling from initial context 'The dog'
221
+ for i in range(3): # 3 output sequences were generated
222
+ print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True)))
223
+
224
+ tokenizer = AutoTokenizer.from_pretrained('distilgpt2') # Initialize tokenizer
225
+ model = AutoModelWithLMHead.from_pretrained('distilgpt2') # Download model and configuration from S3 and cache.
226
+ input_context = 'The dog'
227
+ input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context
228
+ outputs = model.generate(input_ids=input_ids, max_length=40, temperature=0.7, do_sample=True, num_return_sequences=3) # generate 3 independent sequences by sampling
229
+ for i in range(3): # 3 output sequences were generated
230
+ print('Generated {}: {}'.format(i, tokenizer.decode(outputs[i], skip_special_tokens=True)))
231
+
232
+ tokenizer = AutoTokenizer.from_pretrained('ctrl') # Initialize tokenizer
233
+ model = AutoModelWithLMHead.from_pretrained('ctrl') # Download model and configuration from S3 and cache.
234
+ input_context = 'Legal My neighbor is' # "Legal" is one of the control codes for ctrl
235
+ input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context
236
+ outputs = model.generate(input_ids=input_ids, max_length=50, temperature=0.7, repetition_penalty=1.2) # generate sequences
237
+ print('Generated: {}'.format(tokenizer.decode(outputs[0], skip_special_tokens=True)))
238
+
239
+ tokenizer = AutoTokenizer.from_pretrained('gpt2') # Initialize tokenizer
240
+ model = AutoModelWithLMHead.from_pretrained('gpt2') # Download model and configuration from S3 and cache.
241
+ input_context = 'My cute dog'
242
+ bad_words_ids = [tokenizer.encode(bad_word, add_prefix_space=True) for bad_word in ['idiot', 'stupid', 'shut up']]
243
+ input_ids = tokenizer.encode(input_context, return_tensors='pt') # encode input context
244
+ outputs = model.generate(input_ids=input_ids, max_length=100, do_sample=True, bad_words_ids=bad_words_ids) # generate sequences without allowing bad_words to be generated
245
+ """
246
+
247
+ # We cannot generate if the model does not have a LM head
248
+ if self.get_output_embeddings() is None:
249
+ raise AttributeError(
250
+ "You tried to generate sequences with a model that does not have a LM Head."
251
+ "Please use another model class (e.g. `OpenAIGPTLMHeadModel`, `XLNetLMHeadModel`, `GPT2LMHeadModel`, `CTRLLMHeadModel`, `T5WithLMHeadModel`, `TransfoXLLMHeadModel`, `XLMWithLMHeadModel`, `BartForConditionalGeneration` )"
252
+ )
253
+
254
+ max_length = max_length if max_length is not None else self.config.max_length
255
+ min_length = min_length if min_length is not None else self.config.min_length
256
+ do_sample = do_sample if do_sample is not None else self.config.do_sample
257
+ early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping
258
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
259
+ num_beams = num_beams if num_beams is not None else self.config.num_beams
260
+ temperature = temperature if temperature is not None else self.config.temperature
261
+ top_k = top_k if top_k is not None else self.config.top_k
262
+ top_p = top_p if top_p is not None else self.config.top_p
263
+ repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.repetition_penalty
264
+ bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id
265
+ pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id
266
+ eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id
267
+ length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty
268
+ no_repeat_ngram_size = (
269
+ no_repeat_ngram_size if no_repeat_ngram_size is not None else self.config.no_repeat_ngram_size
270
+ )
271
+ bad_words_ids = bad_words_ids if bad_words_ids is not None else self.config.bad_words_ids
272
+ num_return_sequences = (
273
+ num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences
274
+ )
275
+ decoder_start_token_id = (
276
+ decoder_start_token_id if decoder_start_token_id is not None else self.config.decoder_start_token_id
277
+ )
278
+
279
+ if input_ids is not None:
280
+ batch_size = input_ids.shape[0] # overridden by the input batch_size
281
+ else:
282
+ batch_size = 1
283
+
284
+ assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictly positive integer."
285
+ assert isinstance(min_length, int) and min_length >= 0, "`min_length` should be a positive integer."
286
+ assert isinstance(do_sample, bool), "`do_sample` should be a boolean."
287
+ assert isinstance(early_stopping, bool), "`early_stopping` should be a boolean."
288
+ assert isinstance(use_cache, bool), "`use_cache` should be a boolean."
289
+ assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictly positive integer."
290
+ assert temperature > 0, "`temperature` should be strictly positive."
291
+ assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a positive integer."
292
+ assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1."
293
+ assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1."
294
+ assert input_ids is not None or (
295
+ isinstance(bos_token_id, int) and bos_token_id >= 0
296
+ ), "If input_ids is not defined, `bos_token_id` should be a positive integer."
297
+ assert pad_token_id is None or (
298
+ isinstance(pad_token_id, int) and (pad_token_id >= 0)
299
+ ), "`pad_token_id` should be a positive integer."
300
+ assert (eos_token_id is None) or (
301
+ isinstance(eos_token_id, int) and (eos_token_id >= 0)
302
+ ), "`eos_token_id` should be a positive integer."
303
+ assert length_penalty > 0, "`length_penalty` should be strictly positive."
304
+ assert (
305
+ isinstance(no_repeat_ngram_size, int) and no_repeat_ngram_size >= 0
306
+ ), "`no_repeat_ngram_size` should be a positive integer."
307
+ assert (
308
+ isinstance(num_return_sequences, int) and num_return_sequences > 0
309
+ ), "`num_return_sequences` should be a strictly positive integer."
310
+ assert (
311
+ bad_words_ids is None or isinstance(bad_words_ids, list) and isinstance(bad_words_ids[0], list)
312
+ ), "`bad_words_ids` is either `None` or a list of lists of tokens that should not be generated"
313
+
314
+ if input_ids is None:
315
+ assert isinstance(bos_token_id, int) and bos_token_id >= 0, (
316
+ "you should either supply a context to complete as `input_ids` input "
317
+ "or a `bos_token_id` (integer >= 0) as a first token to start the generation."
318
+ )
319
+ input_ids = torch.full(
320
+ (batch_size, 1), bos_token_id, dtype=torch.long, device=next(self.parameters()).device,
321
+ )
322
+ else:
323
+ assert input_ids.dim() == 2, "Input prompt should be of shape (batch_size, sequence length)."
324
+
325
+ # do not allow duplicate outputs when doing greedy decoding
326
+ if do_sample is False:
327
+ if num_beams == 1:
328
+ # no_beam_search greedy generation conditions
329
+ assert (
330
+ num_return_sequences == 1
331
+ ), "Greedy decoding will always produce the same output for num_beams == 1 and num_return_sequences > 1. Please set num_return_sequences = 1"
332
+
333
+ else:
334
+ # beam_search greedy generation conditions
335
+ assert (
336
+ num_beams >= num_return_sequences
337
+ ), "Greedy beam search decoding cannot return more sequences than it has beams. Please set num_beams >= num_return_sequences"
338
+
339
+ # create attention mask if necessary
340
+ # TODO (PVP): this should later be handled by the forward fn() in each model in the future see PR 3140
341
+ if (attention_mask is None) and (pad_token_id is not None) and (pad_token_id in input_ids):
342
+ attention_mask = input_ids.ne(pad_token_id).long()
343
+ elif attention_mask is None:
344
+ attention_mask = input_ids.new_ones(input_ids.shape)
345
+
346
+ # set pad_token_id to eos_token_id if not set. Important that this is done after
347
+ # attention_mask is created
348
+ if pad_token_id is None and eos_token_id is not None:
349
+ logger.warning(
350
+ "Setting `pad_token_id` to {} (first `eos_token_id`) to generate sequence".format(eos_token_id)
351
+ )
352
+ pad_token_id = eos_token_id
353
+
354
+ # current position and vocab size
355
+ if hasattr(self.config, "vocab_size"):
356
+ vocab_size = self.config.vocab_size
357
+ elif (
358
+ self.config.is_encoder_decoder
359
+ and hasattr(self.config, "decoder")
360
+ and hasattr(self.config.decoder, "vocab_size")
361
+ ):
362
+ vocab_size = self.config.decoder.vocab_size
363
+
364
+ # set effective batch size and effective batch multiplier according to do_sample
365
+ if do_sample:
366
+ effective_batch_size = batch_size * num_return_sequences
367
+ effective_batch_mult = num_return_sequences
368
+ else:
369
+ effective_batch_size = batch_size
370
+ effective_batch_mult = 1
371
+
372
+ if self.config.is_encoder_decoder:
373
+ if decoder_start_token_id is None:
374
+ decoder_start_token_id = bos_token_id
375
+
376
+ assert (
377
+ decoder_start_token_id is not None
378
+ ), "decoder_start_token_id or bos_token_id has to be defined for encoder-decoder generation"
379
+ assert hasattr(self, "get_encoder"), "{} should have a 'get_encoder' function defined".format(self)
380
+ assert callable(self.get_encoder), "{} should be a method".format(self.get_encoder)
381
+
382
+ # get encoder and store encoder outputs
383
+ encoder = self.get_encoder()
384
+
385
+ encoder_outputs: tuple = encoder(input_ids, attention_mask=attention_mask)
386
+
387
+ # Expand input ids if num_beams > 1 or num_return_sequences > 1
388
+ if num_return_sequences > 1 or num_beams > 1:
389
+ input_ids_len = input_ids.shape[-1]
390
+ input_ids = input_ids.unsqueeze(1).expand(batch_size, effective_batch_mult * num_beams, input_ids_len)
391
+ attention_mask = attention_mask.unsqueeze(1).expand(
392
+ batch_size, effective_batch_mult * num_beams, input_ids_len
393
+ )
394
+
395
+ input_ids = input_ids.contiguous().view(
396
+ effective_batch_size * num_beams, input_ids_len
397
+ ) # shape: (batch_size * num_return_sequences * num_beams, cur_len)
398
+ attention_mask = attention_mask.contiguous().view(
399
+ effective_batch_size * num_beams, input_ids_len
400
+ ) # shape: (batch_size * num_return_sequences * num_beams, cur_len)
401
+
402
+ if self.config.is_encoder_decoder:
403
+ # create empty decoder_input_ids
404
+ input_ids = torch.full(
405
+ (effective_batch_size * num_beams, 1),
406
+ decoder_start_token_id,
407
+ dtype=torch.long,
408
+ device=next(self.parameters()).device,
409
+ )
410
+ cur_len = 1
411
+
412
+ assert (
413
+ batch_size == encoder_outputs[0].shape[0]
414
+ ), f"expected encoder_outputs[0] to have 1st dimension bs={batch_size}, got {encoder_outputs[0].shape[0]} "
415
+
416
+ # expand batch_idx to assign correct encoder output for expanded input_ids (due to num_beams > 1 and num_return_sequences > 1)
417
+ expanded_batch_idxs = (
418
+ torch.arange(batch_size)
419
+ .view(-1, 1)
420
+ .repeat(1, num_beams * effective_batch_mult)
421
+ .view(-1)
422
+ .to(input_ids.device)
423
+ )
424
+ # expand encoder_outputs
425
+ encoder_outputs = (encoder_outputs[0].index_select(0, expanded_batch_idxs), *encoder_outputs[1:])
426
+
427
+ else:
428
+ encoder_outputs = None
429
+ cur_len = input_ids.shape[-1]
430
+
431
+ assert (
432
+ cur_len < max_length
433
+ ), f"The context has {cur_len} number of tokens, but `max_length` is only {max_length}. Please make sure that `max_length` is bigger than the number of tokens, by setting either `generate(max_length=...,...)` or `config.max_length = ...`"
434
+
435
+ if num_beams > 1:
436
+ output = self._generate_beam_search(
437
+ input_ids,
438
+ cur_len=cur_len,
439
+ max_length=max_length,
440
+ min_length=min_length,
441
+ do_sample=do_sample,
442
+ early_stopping=early_stopping,
443
+ temperature=temperature,
444
+ top_k=top_k,
445
+ top_p=top_p,
446
+ repetition_penalty=repetition_penalty,
447
+ no_repeat_ngram_size=no_repeat_ngram_size,
448
+ bad_words_ids=bad_words_ids,
449
+ pad_token_id=pad_token_id,
450
+ eos_token_id=eos_token_id,
451
+ batch_size=effective_batch_size,
452
+ num_return_sequences=num_return_sequences,
453
+ length_penalty=length_penalty,
454
+ num_beams=num_beams,
455
+ vocab_size=vocab_size,
456
+ encoder_outputs=encoder_outputs,
457
+ attention_mask=attention_mask,
458
+ use_cache=use_cache,
459
+ model_specific_kwargs=model_specific_kwargs,
460
+ )
461
+ else:
462
+ output = self._generate_no_beam_search(
463
+ input_ids,
464
+ cur_len=cur_len,
465
+ max_length=max_length,
466
+ min_length=min_length,
467
+ do_sample=do_sample,
468
+ temperature=temperature,
469
+ top_k=top_k,
470
+ top_p=top_p,
471
+ repetition_penalty=repetition_penalty,
472
+ no_repeat_ngram_size=no_repeat_ngram_size,
473
+ bad_words_ids=bad_words_ids,
474
+ pad_token_id=pad_token_id,
475
+ eos_token_id=eos_token_id,
476
+ batch_size=effective_batch_size,
477
+ encoder_outputs=encoder_outputs,
478
+ attention_mask=attention_mask,
479
+ use_cache=use_cache,
480
+ model_specific_kwargs=model_specific_kwargs,
481
+ )
482
+
483
+ return output
484
+
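A minimal usage sketch of the generate() mixin above, assuming it is mixed into a pretrained causal or encoder-decoder LM; `model` and `tokenizer` below stand for an illustrative matching pair, not objects defined in this repository:

import torch

input_ids = tokenizer("the person on the left", return_tensors="pt").input_ids
output_ids = model.generate(
    input_ids,
    max_length=32,           # hard upper bound checked by the asserts above
    num_beams=4,             # > 1 routes to _generate_beam_search below
    do_sample=False,         # greedy beam search
    no_repeat_ngram_size=2,  # handled by calc_banned_ngram_tokens further down
    early_stopping=True,
)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))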
485
+ def _generate_no_beam_search(
486
+ self,
487
+ input_ids,
488
+ cur_len,
489
+ max_length,
490
+ min_length,
491
+ do_sample,
492
+ temperature,
493
+ top_k,
494
+ top_p,
495
+ repetition_penalty,
496
+ no_repeat_ngram_size,
497
+ bad_words_ids,
498
+ pad_token_id,
499
+ eos_token_id,
500
+ batch_size,
501
+ encoder_outputs,
502
+ attention_mask,
503
+ use_cache,
504
+ model_specific_kwargs,
505
+ ):
506
+ """ Generate sequences for each example without beam search (num_beams == 1).
507
+ All returned sequences are generated independently.
508
+ """
509
+ # length of generated sentences / unfinished sentences
510
+ unfinished_sents = input_ids.new(batch_size).fill_(1)
511
+ sent_lengths = input_ids.new(batch_size).fill_(max_length)
512
+
513
+ past = (encoder_outputs, None) if encoder_outputs is not None else None
514
+
515
+ while cur_len < max_length:
516
+ model_inputs = self.prepare_inputs_for_generation(
517
+ input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache, **model_specific_kwargs
518
+ )
519
+
520
+ outputs = self(**model_inputs)
521
+ next_token_logits = outputs[0][:, -1, :]
522
+
523
+ scores = self.postprocess_next_token_scores(
524
+ scores=next_token_logits,
525
+ input_ids=input_ids,
526
+ no_repeat_ngram_size=no_repeat_ngram_size,
527
+ bad_words_ids=bad_words_ids,
528
+ cur_len=cur_len,
529
+ min_length=min_length,
530
+ max_length=max_length,
531
+ eos_token_id=eos_token_id,
532
+ repetition_penalty=repetition_penalty,
533
+ batch_size=batch_size,
534
+ num_beams=1,
535
+ )
536
+
537
+ # if model has past, then set the past variable to speed up decoding
538
+ if self._use_cache(outputs, use_cache):
539
+ past = outputs[1]
540
+
541
+ if do_sample:
542
+ # Temperature (higher temperature => more likely to sample low probability tokens)
543
+ if temperature != 1.0:
544
+ scores = scores / temperature
545
+ # Top-p/top-k filtering
546
+ next_token_logscores = top_k_top_p_filtering(scores, top_k=top_k, top_p=top_p)
547
+ # Sample
548
+ probs = F.softmax(next_token_logscores, dim=-1)
549
+ next_token = torch.multinomial(probs, num_samples=1).squeeze(1)
550
+ else:
551
+ # Greedy decoding
552
+ next_token = torch.argmax(next_token_logits, dim=-1)
553
+
554
+ # update generations and finished sentences
555
+ if eos_token_id is not None:
556
+ # pad finished sentences if eos_token_id exist
557
+ tokens_to_add = next_token * unfinished_sents + (pad_token_id) * (1 - unfinished_sents)
558
+ else:
559
+ tokens_to_add = next_token
560
+
561
+ # add token and increase length by one
562
+ input_ids = torch.cat([input_ids, tokens_to_add.unsqueeze(-1)], dim=-1)
563
+ cur_len = cur_len + 1
564
+
565
+ if eos_token_id is not None:
566
+ eos_in_sents = tokens_to_add == eos_token_id
567
+ # if sentence is unfinished and the token to add is eos, sent_lengths is filled with current length
568
+ is_sents_unfinished_and_token_to_add_is_eos = unfinished_sents.mul(eos_in_sents.long()).bool()
569
+ sent_lengths.masked_fill_(is_sents_unfinished_and_token_to_add_is_eos, cur_len)
570
+ # unfinished_sents is set to zero if eos in sentence
571
+ unfinished_sents.mul_((~eos_in_sents).long())
572
+
573
+ # stop when there is a </s> in each sentence, or if we exceed the maximum length
574
+ if unfinished_sents.max() == 0:
575
+ break
576
+
577
+ # extend attention_mask for the newly generated token if the model is decoder-only
578
+ if self.config.is_encoder_decoder is False:
579
+ attention_mask = torch.cat(
580
+ [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
581
+ )
582
+
583
+ return input_ids
584
+
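A toy illustration (values made up) of the padding bookkeeping used above: once a sentence has produced `eos_token_id`, its entry in `unfinished_sents` is zeroed and every later step appends `pad_token_id` instead of the decoded token:

import torch

unfinished_sents = torch.tensor([1, 0])   # the second sentence finished earlier
next_token = torch.tensor([42, 7])        # freshly decoded tokens for both sentences
pad_token_id = 0
tokens_to_add = next_token * unfinished_sents + pad_token_id * (1 - unfinished_sents)
print(tokens_to_add)                      # tensor([42, 0]) -> the finished sentence keeps getting padded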
585
+ def _generate_beam_search(
586
+ self,
587
+ input_ids,
588
+ cur_len,
589
+ max_length,
590
+ min_length,
591
+ do_sample,
592
+ early_stopping,
593
+ temperature,
594
+ top_k,
595
+ top_p,
596
+ repetition_penalty,
597
+ no_repeat_ngram_size,
598
+ bad_words_ids,
599
+ pad_token_id,
600
+ eos_token_id,
601
+ batch_size,
602
+ num_return_sequences,
603
+ length_penalty,
604
+ num_beams,
605
+ vocab_size,
606
+ encoder_outputs,
607
+ attention_mask,
608
+ use_cache,
609
+ model_specific_kwargs,
610
+ ):
611
+ """ Generate sequences for each example with beam search.
612
+ """
613
+
614
+ # generated hypotheses
615
+ generated_hyps = [
616
+ BeamHypotheses(num_beams, max_length, length_penalty, early_stopping=early_stopping)
617
+ for _ in range(batch_size)
618
+ ]
619
+
620
+ # scores for each sentence in the beam
621
+ beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device)
622
+
623
+ # for greedy decoding it is made sure that only tokens of the first beam are considered to avoid sampling the exact same tokens three times
624
+ if do_sample is False:
625
+ beam_scores[:, 1:] = -1e9
626
+ beam_scores = beam_scores.view(-1) # shape (batch_size * num_beams,)
627
+
628
+ # cache compute states
629
+ past = (encoder_outputs, None) if encoder_outputs is not None else None
630
+
631
+ # done sentences
632
+ done = [False for _ in range(batch_size)]
633
+
634
+ while cur_len < max_length:
635
+ model_inputs = self.prepare_inputs_for_generation(
636
+ input_ids, past=past, attention_mask=attention_mask, use_cache=use_cache, **model_specific_kwargs
637
+ )
638
+ outputs = self(**model_inputs) # (batch_size * num_beams, cur_len, vocab_size)
639
+ next_token_logits = outputs[0][:, -1, :] # (batch_size * num_beams, vocab_size)
640
+
641
+ # if model has past, then set the past variable to speed up decoding
642
+ if self._use_cache(outputs, use_cache):
643
+ past = outputs[1]
644
+ if self.config.is_encoder_decoder and do_sample is False:
645
+ # TODO (PVP) still a bit hacky here - there might be a better solution
646
+ next_token_logits = self.adjust_logits_during_generation(
647
+ next_token_logits, cur_len=cur_len, max_length=max_length
648
+ )
649
+
650
+ scores = F.log_softmax(next_token_logits, dim=-1) # (batch_size * num_beams, vocab_size)
651
+
652
+ scores = self.postprocess_next_token_scores(
653
+ scores=scores,
654
+ input_ids=input_ids,
655
+ no_repeat_ngram_size=no_repeat_ngram_size,
656
+ bad_words_ids=bad_words_ids,
657
+ cur_len=cur_len,
658
+ min_length=min_length,
659
+ max_length=max_length,
660
+ eos_token_id=eos_token_id,
661
+ repetition_penalty=repetition_penalty,
662
+ batch_size=batch_size,
663
+ num_beams=num_beams,
664
+ )
665
+
666
+ assert scores.shape == (batch_size * num_beams, vocab_size), "Shapes of scores: {} != {}".format(
667
+ scores.shape, (batch_size * num_beams, vocab_size)
668
+ )
669
+
670
+ if do_sample:
671
+ _scores = scores + beam_scores[:, None].expand_as(scores) # (batch_size * num_beams, vocab_size)
672
+ # Temperature
673
+ if temperature != 1.0:
674
+ _scores = _scores / temperature
675
+ # Top-p/top-k filtering
676
+ _scores = top_k_top_p_filtering(
677
+ _scores, top_k=top_k, top_p=top_p, min_tokens_to_keep=2
678
+ ) # (batch_size * num_beams, vocab_size)
679
+ # re-organize to group the beam together to sample from all beam_idxs
680
+ _scores = _scores.contiguous().view(
681
+ batch_size, num_beams * vocab_size
682
+ ) # (batch_size, num_beams * vocab_size)
683
+
684
+ # Sample 2 next tokens for each beam (so we have some spare tokens and match output of greedy beam search)
685
+ probs = F.softmax(_scores, dim=-1)
686
+ next_tokens = torch.multinomial(probs, num_samples=2 * num_beams) # (batch_size, num_beams * 2)
687
+ # Compute next scores
688
+ next_scores = torch.gather(_scores, -1, next_tokens) # (batch_size, num_beams * 2)
689
+ # sort the sampled vector to make sure that the first num_beams samples are the best
690
+ next_scores, next_scores_indices = torch.sort(next_scores, descending=True, dim=1)
691
+ next_tokens = torch.gather(next_tokens, -1, next_scores_indices) # (batch_size, num_beams * 2)
692
+
693
+ else:
694
+ next_scores = scores + beam_scores[:, None].expand_as(scores) # (batch_size * num_beams, vocab_size)
695
+
696
+ # re-organize to group the beams together (we are keeping the top hypotheses across beams)
697
+ next_scores = next_scores.view(
698
+ batch_size, num_beams * vocab_size
699
+ ) # (batch_size, num_beams * vocab_size)
700
+
701
+ next_scores, next_tokens = torch.topk(next_scores, 2 * num_beams, dim=1, largest=True, sorted=True)
702
+
703
+ assert next_scores.size() == next_tokens.size() == (batch_size, 2 * num_beams)
704
+
705
+ # next batch beam content
706
+ next_batch_beam = []
707
+
708
+ # for each sentence
709
+ for batch_idx in range(batch_size):
710
+
711
+ # if we are done with this sentence, add a pad token
712
+ if done[batch_idx]:
713
+ assert (
714
+ len(generated_hyps[batch_idx]) >= num_beams
715
+ ), "Batch can only be done if at least {} beams have been generated".format(num_beams)
716
+ assert (
717
+ eos_token_id is not None and pad_token_id is not None
718
+ ), "generated beams >= num_beams -> eos_token_id and pad_token have to be defined"
719
+ next_batch_beam.extend([(0, pad_token_id, 0)] * num_beams) # pad the batch
720
+ continue
721
+
722
+ # next sentence beam content, this will get added to next_batch_beam
723
+ next_sent_beam = []
724
+
725
+ # next tokens for this sentence
726
+ for beam_token_rank, (beam_token_id, beam_token_score) in enumerate(
727
+ zip(next_tokens[batch_idx], next_scores[batch_idx])
728
+ ):
729
+ # get beam and token IDs
730
+ beam_id = beam_token_id // vocab_size
731
+ token_id = beam_token_id % vocab_size
732
+
733
+ effective_beam_id = batch_idx * num_beams + beam_id
734
+ # add to generated hypotheses if end of sentence
735
+ if (eos_token_id is not None) and (token_id.item() == eos_token_id):
736
+ # if beam_token does not belong to top num_beams tokens, it should not be added
737
+ is_beam_token_worse_than_top_num_beams = beam_token_rank >= num_beams
738
+ if is_beam_token_worse_than_top_num_beams:
739
+ continue
740
+ generated_hyps[batch_idx].add(
741
+ input_ids[effective_beam_id].clone(), beam_token_score.item(),
742
+ )
743
+ else:
744
+ # add next predicted token since it is not eos_token
745
+ next_sent_beam.append((beam_token_score, token_id, effective_beam_id))
746
+
747
+ # once the beam for next step is full, don't add more tokens to it.
748
+ if len(next_sent_beam) == num_beams:
749
+ break
750
+
751
+ # Check if we are done so that we can save a pad step if all(done)
752
+ done[batch_idx] = done[batch_idx] or generated_hyps[batch_idx].is_done(
753
+ next_scores[batch_idx].max().item(), cur_len
754
+ )
755
+
756
+ # update next beam content
757
+ assert len(next_sent_beam) == num_beams, "Beam should always be full"
758
+ next_batch_beam.extend(next_sent_beam)
759
+ assert len(next_batch_beam) == num_beams * (batch_idx + 1), "We should have added num_beams each step"
760
+
761
+ # stop when we are done with each sentence
762
+ if all(done):
763
+ break
764
+
765
+ # sanity check / prepare next batch
766
+ assert len(next_batch_beam) == batch_size * num_beams
767
+ beam_scores = beam_scores.new([x[0] for x in next_batch_beam])
768
+ beam_tokens = input_ids.new([x[1] for x in next_batch_beam])
769
+ beam_idx = input_ids.new([x[2] for x in next_batch_beam])
770
+
771
+ # re-order batch and update current length
772
+ input_ids = input_ids[beam_idx, :]
773
+ input_ids = torch.cat([input_ids, beam_tokens.unsqueeze(1)], dim=-1)
774
+ cur_len = cur_len + 1
775
+
776
+ # re-order internal states
777
+ if past is not None:
778
+ past = self._reorder_cache(past, beam_idx)
779
+
780
+ # extend attention_mask for the newly generated token if the model is decoder-only
781
+ if self.config.is_encoder_decoder is False:
782
+ attention_mask = torch.cat(
783
+ [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1
784
+ )
785
+
786
+ # finalize all open beam hypotheses and add to generated hypotheses
787
+ for batch_idx in range(batch_size):
788
+ if done[batch_idx]:
789
+ continue
790
+
791
+ # test that beam scores match previously calculated scores if not eos and batch_idx not done
792
+ if eos_token_id is not None and all(
793
+ (token_id % vocab_size).item() != eos_token_id for token_id in next_tokens[batch_idx]
794
+ ):
795
+ assert torch.all(
796
+ next_scores[batch_idx, :num_beams] == beam_scores.view(batch_size, num_beams)[batch_idx]
797
+ ), "If batch_idx is not done, final next scores: {} have to equal to accumulated beam_scores: {}".format(
798
+ next_scores[:, :num_beams][batch_idx], beam_scores.view(batch_size, num_beams)[batch_idx],
799
+ )
800
+
801
+ # need to add best num_beams hypotheses to generated hyps
802
+ for beam_id in range(num_beams):
803
+ effective_beam_id = batch_idx * num_beams + beam_id
804
+ final_score = beam_scores[effective_beam_id].item()
805
+ final_tokens = input_ids[effective_beam_id]
806
+ generated_hyps[batch_idx].add(final_tokens, final_score)
807
+
808
+ # depending on whether greedy generation is wanted or not define different output_batch_size and output_num_return_sequences_per_batch
809
+ output_batch_size = batch_size if do_sample else batch_size * num_return_sequences
810
+ output_num_return_sequences_per_batch = 1 if do_sample else num_return_sequences
811
+
812
+ # select the best hypotheses
813
+ sent_lengths = input_ids.new(output_batch_size)
814
+ best = []
815
+
816
+ # retrieve best hypotheses
817
+ for i, hypotheses in enumerate(generated_hyps):
818
+ sorted_hyps = sorted(hypotheses.beams, key=lambda x: x[0])
819
+ for j in range(output_num_return_sequences_per_batch):
820
+ effective_batch_idx = output_num_return_sequences_per_batch * i + j
821
+ best_hyp = sorted_hyps.pop()[1]
822
+ sent_lengths[effective_batch_idx] = len(best_hyp)
823
+ best.append(best_hyp)
824
+
825
+ # shorter batches are padded
826
+ if sent_lengths.min().item() != sent_lengths.max().item():
827
+ assert pad_token_id is not None, "`Pad_token_id` has to be defined"
828
+ sent_max_len = min(sent_lengths.max().item() + 1, max_length)
829
+ decoded = input_ids.new(output_batch_size, sent_max_len).fill_(pad_token_id)
830
+
831
+ # fill with hypothesis and eos_token_id if necessary
832
+ for i, hypo in enumerate(best):
833
+ decoded[i, : sent_lengths[i]] = hypo
834
+ if sent_lengths[i] < max_length:
835
+ decoded[i, sent_lengths[i]] = eos_token_id
836
+ else:
837
+ # none of the hypotheses have an eos_token
838
+ assert all(len(hypo) == max_length for hypo in best)
839
+ decoded = torch.stack(best).type(torch.long).to(next(self.parameters()).device)
840
+
841
+ return decoded
842
+
843
+ @staticmethod
844
+ def _reorder_cache(past: Tuple, beam_idx: Tensor) -> Tuple[Tensor]:
845
+ return tuple(layer_past.index_select(1, beam_idx) for layer_past in past)
846
+
847
+
848
+ def calc_banned_ngram_tokens(prev_input_ids: Tensor, num_hypos: int, no_repeat_ngram_size: int, cur_len: int) -> None:
849
+ """Copied from fairseq for no_repeat_ngram in beam_search"""
850
+ if cur_len + 1 < no_repeat_ngram_size:
851
+ # return no banned tokens if we haven't generated no_repeat_ngram_size tokens yet
852
+ return [[] for _ in range(num_hypos)]
853
+ generated_ngrams = [{} for _ in range(num_hypos)]
854
+ for idx in range(num_hypos):
855
+ gen_tokens = prev_input_ids[idx].tolist()
856
+ generated_ngram = generated_ngrams[idx]
857
+ for ngram in zip(*[gen_tokens[i:] for i in range(no_repeat_ngram_size)]):
858
+ prev_ngram_tuple = tuple(ngram[:-1])
859
+ generated_ngram[prev_ngram_tuple] = generated_ngram.get(prev_ngram_tuple, []) + [ngram[-1]]
860
+
861
+ def _get_generated_ngrams(hypo_idx):
862
+ # Before decoding the next token, prevent decoding of ngrams that have already appeared
863
+ start_idx = cur_len + 1 - no_repeat_ngram_size
864
+ ngram_idx = tuple(prev_input_ids[hypo_idx, start_idx:cur_len].tolist())
865
+ return generated_ngrams[hypo_idx].get(ngram_idx, [])
866
+
867
+ banned_tokens = [_get_generated_ngrams(hypo_idx) for hypo_idx in range(num_hypos)]
868
+ return banned_tokens
869
+
870
+
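A small sanity check of `calc_banned_ngram_tokens`, assuming one hypothesis whose generated tokens already contain the bigram (8, 5); the token ids are arbitrary:

import torch

prev_input_ids = torch.tensor([[5, 8, 5, 8]])  # one hypothesis, four generated tokens
banned = calc_banned_ngram_tokens(prev_input_ids, num_hypos=1, no_repeat_ngram_size=2, cur_len=4)
print(banned)  # [[5]] -> emitting 5 after the trailing 8 would repeat the bigram (8, 5)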
871
+ def calc_banned_bad_words_ids(prev_input_ids: Iterable[int], bad_words_ids: Iterable[int]) -> Iterable[int]:
872
+ banned_tokens = []
873
+
874
+ def _tokens_match(prev_tokens, tokens):
875
+ if len(tokens) == 0:
876
+ # if the banned sequence is just one token, always ban it
877
+ return True
878
+ if len(tokens) > len(prev_input_ids):
879
+ # if bad word tokens are longer than prev input_ids they can't be equal
880
+ return False
881
+
882
+ if prev_tokens[-len(tokens) :] == tokens:
883
+ # if tokens match
884
+ return True
885
+ else:
886
+ return False
887
+
888
+ for prev_input_ids_slice in prev_input_ids:
889
+ banned_tokens_slice = []
890
+
891
+ for banned_token_seq in bad_words_ids:
892
+ assert len(banned_token_seq) > 0, "Banned words token sequences {} cannot have an empty list".format(
893
+ bad_words_ids
894
+ )
895
+
896
+ if _tokens_match(prev_input_ids_slice.tolist(), banned_token_seq[:-1]) is False:
897
+ # if tokens do not match continue
898
+ continue
899
+
900
+ banned_tokens_slice.append(banned_token_seq[-1])
901
+
902
+ banned_tokens.append(banned_tokens_slice)
903
+
904
+ return banned_tokens
905
+
906
+
907
+ def top_k_top_p_filtering(
908
+ logits: Tensor,
909
+ top_k: int = 0,
910
+ top_p: float = 1.0,
911
+ filter_value: float = -float("Inf"),
912
+ min_tokens_to_keep: int = 1,
913
+ ) -> Tensor:
914
+ """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
915
+ Args:
916
+ logits: logits distribution shape (batch size, vocabulary size)
917
+ if top_k > 0: keep only top k tokens with highest probability (top-k filtering).
918
+ if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
919
+ Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
920
+ Make sure we keep at least min_tokens_to_keep per batch example in the output
921
+ From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
922
+ """
923
+ if top_k > 0:
924
+ top_k = min(max(top_k, min_tokens_to_keep), logits.size(-1)) # Safety check
925
+ # Remove all tokens with a probability less than the last token of the top-k
926
+ indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
927
+ logits[indices_to_remove] = filter_value
928
+
929
+ if top_p < 1.0:
930
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
931
+ cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
932
+
933
+ # Remove tokens with cumulative probability above the threshold (token with 0 are kept)
934
+ sorted_indices_to_remove = cumulative_probs > top_p
935
+ if min_tokens_to_keep > 1:
936
+ # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below)
937
+ sorted_indices_to_remove[..., :min_tokens_to_keep] = 0
938
+ # Shift the indices to the right to keep also the first token above the threshold
939
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
940
+ sorted_indices_to_remove[..., 0] = 0
941
+
942
+ # scatter sorted tensors to original indexing
943
+ indices_to_remove = sorted_indices_to_remove.scatter(1, sorted_indices, sorted_indices_to_remove)
944
+ logits[indices_to_remove] = filter_value
945
+ return logits
946
+
947
+
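A worked toy example of `top_k_top_p_filtering`, assuming a vocabulary of four tokens; with `top_k=2` the two weakest logits are pushed to -inf, so they receive zero probability after the softmax:

import torch
import torch.nn.functional as F

logits = torch.tensor([[2.0, 1.0, 0.5, -1.0]])              # (batch=1, vocab=4)
filtered = top_k_top_p_filtering(logits.clone(), top_k=2)   # clone(): the filter writes in place
probs = F.softmax(filtered, dim=-1)
print(probs)  # ~[[0.73, 0.27, 0.00, 0.00]]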
948
+ class BeamHypotheses(object):
949
+ def __init__(self, num_beams, max_length, length_penalty, early_stopping):
950
+ """
951
+ Initialize n-best list of hypotheses.
952
+ """
953
+ self.max_length = max_length - 1 # ignoring bos_token
954
+ self.length_penalty = length_penalty
955
+ self.early_stopping = early_stopping
956
+ self.num_beams = num_beams
957
+ self.beams = []
958
+ self.worst_score = 1e9
959
+
960
+ def __len__(self):
961
+ """
962
+ Number of hypotheses in the list.
963
+ """
964
+ return len(self.beams)
965
+
966
+ def add(self, hyp, sum_logprobs):
967
+ """
968
+ Add a new hypothesis to the list.
969
+ """
970
+ score = sum_logprobs / len(hyp) ** self.length_penalty
971
+ if len(self) < self.num_beams or score > self.worst_score:
972
+ self.beams.append((score, hyp))
973
+ if len(self) > self.num_beams:
974
+ sorted_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.beams)])
975
+ del self.beams[sorted_scores[0][1]]
976
+ self.worst_score = sorted_scores[1][0]
977
+ else:
978
+ self.worst_score = min(score, self.worst_score)
979
+
980
+ def is_done(self, best_sum_logprobs, cur_len):
981
+ """
982
+ If there are enough hypotheses and none of the hypotheses being generated
983
+ can become better than the worst one in the heap, then we are done with this sentence.
984
+ """
985
+
986
+ if len(self) < self.num_beams:
987
+ return False
988
+ elif self.early_stopping:
989
+ return True
990
+ else:
991
+ cur_score = best_sum_logprobs / cur_len ** self.length_penalty
992
+ ret = self.worst_score >= cur_score
993
+ return ret
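A short sketch of how `BeamHypotheses` is used by the beam search above: finished beams are added with their summed log-probabilities, scores are length-normalised by `length_penalty`, and `is_done` reports whether any still-open beam could beat the worst kept hypothesis (numbers below are made up):

import torch

hyps = BeamHypotheses(num_beams=2, max_length=10, length_penalty=1.0, early_stopping=False)
hyps.add(torch.tensor([1, 5, 7]), sum_logprobs=-3.0)  # normalised score -1.0
hyps.add(torch.tensor([1, 9]), sum_logprobs=-1.0)     # normalised score -0.5
print(hyps.is_done(best_sum_logprobs=-2.0, cur_len=4))  # False: an open beam scoring -0.5 could still win
print(hyps.is_done(best_sum_logprobs=-8.0, cur_len=4))  # True: nothing open can beat the worst kept score (-1.0)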
CGFormer/bert/modeling_bert.py ADDED
@@ -0,0 +1,1569 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """PyTorch BERT model. """
17
+
18
+
19
+ import logging
20
+ import math
21
+ import os
22
+ import warnings
23
+
24
+ import torch
25
+ import torch.utils.checkpoint
26
+ from torch import nn
27
+ from torch.nn import CrossEntropyLoss, MSELoss
28
+
29
+ from .activations import gelu, gelu_new, swish
30
+ from .configuration_bert import BertConfig
31
+ from .file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_callable
32
+ from .modeling_utils import PreTrainedModel, find_pruneable_heads_and_indices, prune_linear_layer
33
+
34
+
35
+ logger = logging.getLogger(__name__)
36
+
37
+ _TOKENIZER_FOR_DOC = "BertTokenizer"
38
+
39
+ BERT_PRETRAINED_MODEL_ARCHIVE_LIST = [
40
+ "bert-base-uncased",
41
+ "bert-large-uncased",
42
+ "bert-base-cased",
43
+ "bert-large-cased",
44
+ "bert-base-multilingual-uncased",
45
+ "bert-base-multilingual-cased",
46
+ "bert-base-chinese",
47
+ "bert-base-german-cased",
48
+ "bert-large-uncased-whole-word-masking",
49
+ "bert-large-cased-whole-word-masking",
50
+ "bert-large-uncased-whole-word-masking-finetuned-squad",
51
+ "bert-large-cased-whole-word-masking-finetuned-squad",
52
+ "bert-base-cased-finetuned-mrpc",
53
+ "bert-base-german-dbmdz-cased",
54
+ "bert-base-german-dbmdz-uncased",
55
+ "cl-tohoku/bert-base-japanese",
56
+ "cl-tohoku/bert-base-japanese-whole-word-masking",
57
+ "cl-tohoku/bert-base-japanese-char",
58
+ "cl-tohoku/bert-base-japanese-char-whole-word-masking",
59
+ "TurkuNLP/bert-base-finnish-cased-v1",
60
+ "TurkuNLP/bert-base-finnish-uncased-v1",
61
+ "wietsedv/bert-base-dutch-cased",
62
+ # See all BERT models at https://huggingface.co/models?filter=bert
63
+ ]
64
+
65
+
66
+ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
67
+ """ Load tf checkpoints in a pytorch model.
68
+ """
69
+ try:
70
+ import re
71
+ import numpy as np
72
+ import tensorflow as tf
73
+ except ImportError:
74
+ logger.error(
75
+ "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
76
+ "https://www.tensorflow.org/install/ for installation instructions."
77
+ )
78
+ raise
79
+ tf_path = os.path.abspath(tf_checkpoint_path)
80
+ logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
81
+ # Load weights from TF model
82
+ init_vars = tf.train.list_variables(tf_path)
83
+ names = []
84
+ arrays = []
85
+ for name, shape in init_vars:
86
+ logger.info("Loading TF weight {} with shape {}".format(name, shape))
87
+ array = tf.train.load_variable(tf_path, name)
88
+ names.append(name)
89
+ arrays.append(array)
90
+
91
+ for name, array in zip(names, arrays):
92
+ name = name.split("/")
93
+ # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v
94
+ # which are not required for using pretrained model
95
+ if any(
96
+ n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
97
+ for n in name
98
+ ):
99
+ logger.info("Skipping {}".format("/".join(name)))
100
+ continue
101
+ pointer = model
102
+ for m_name in name:
103
+ if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
104
+ scope_names = re.split(r"_(\d+)", m_name)
105
+ else:
106
+ scope_names = [m_name]
107
+ if scope_names[0] == "kernel" or scope_names[0] == "gamma":
108
+ pointer = getattr(pointer, "weight")
109
+ elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
110
+ pointer = getattr(pointer, "bias")
111
+ elif scope_names[0] == "output_weights":
112
+ pointer = getattr(pointer, "weight")
113
+ elif scope_names[0] == "squad":
114
+ pointer = getattr(pointer, "classifier")
115
+ else:
116
+ try:
117
+ pointer = getattr(pointer, scope_names[0])
118
+ except AttributeError:
119
+ logger.info("Skipping {}".format("/".join(name)))
120
+ continue
121
+ if len(scope_names) >= 2:
122
+ num = int(scope_names[1])
123
+ pointer = pointer[num]
124
+ if m_name[-11:] == "_embeddings":
125
+ pointer = getattr(pointer, "weight")
126
+ elif m_name == "kernel":
127
+ array = np.transpose(array)
128
+ try:
129
+ assert pointer.shape == array.shape
130
+ except AssertionError as e:
131
+ e.args += (pointer.shape, array.shape)
132
+ raise
133
+ logger.info("Initialize PyTorch weight {}".format(name))
134
+ pointer.data = torch.from_numpy(array)
135
+ return model
136
+
137
+
138
+ def mish(x):
139
+ return x * torch.tanh(nn.functional.softplus(x))
140
+
141
+
142
+ ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish, "gelu_new": gelu_new, "mish": mish}
143
+
144
+
145
+ BertLayerNorm = torch.nn.LayerNorm
146
+
147
+
148
+ class BertEmbeddings(nn.Module):
149
+ """Construct the embeddings from word, position and token_type embeddings.
150
+ """
151
+
152
+ def __init__(self, config):
153
+ super().__init__()
154
+ self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
155
+ self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
156
+ self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
157
+
158
+ # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
159
+ # any TensorFlow checkpoint file
160
+ self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
161
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
162
+
163
+ def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
164
+ if input_ids is not None:
165
+ input_shape = input_ids.size()
166
+ else:
167
+ input_shape = inputs_embeds.size()[:-1]
168
+
169
+ seq_length = input_shape[1]
170
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
171
+ if position_ids is None:
172
+ position_ids = torch.arange(seq_length, dtype=torch.long, device=device)
173
+ position_ids = position_ids.unsqueeze(0).expand(input_shape)
174
+ if token_type_ids is None:
175
+ token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
176
+
177
+ if inputs_embeds is None:
178
+ inputs_embeds = self.word_embeddings(input_ids)
179
+ position_embeddings = self.position_embeddings(position_ids)
180
+ token_type_embeddings = self.token_type_embeddings(token_type_ids)
181
+
182
+ embeddings = inputs_embeds + position_embeddings + token_type_embeddings
183
+ embeddings = self.LayerNorm(embeddings)
184
+ embeddings = self.dropout(embeddings)
185
+ return embeddings
186
+
187
+
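A minimal shape walk-through of `BertEmbeddings`, assuming the default `BertConfig` shipped in this folder (vocab 30522, hidden size 768); position and token-type ids are filled in automatically when omitted:

import torch

config = BertConfig()
emb = BertEmbeddings(config)
input_ids = torch.randint(0, config.vocab_size, (2, 16))  # (batch, seq_len)
out = emb(input_ids=input_ids)                            # word + position + token-type, then LayerNorm/dropout
print(out.shape)                                          # torch.Size([2, 16, 768])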
188
+ class BertSelfAttention(nn.Module):
189
+ def __init__(self, config):
190
+ super().__init__()
191
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
192
+ raise ValueError(
193
+ "The hidden size (%d) is not a multiple of the number of attention "
194
+ "heads (%d)" % (config.hidden_size, config.num_attention_heads)
195
+ )
196
+
197
+ self.num_attention_heads = config.num_attention_heads
198
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
199
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
200
+
201
+ self.query = nn.Linear(config.hidden_size, self.all_head_size)
202
+ self.key = nn.Linear(config.hidden_size, self.all_head_size)
203
+ self.value = nn.Linear(config.hidden_size, self.all_head_size)
204
+
205
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
206
+
207
+ def transpose_for_scores(self, x):
208
+ new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
209
+ x = x.view(*new_x_shape)
210
+ return x.permute(0, 2, 1, 3)
211
+
212
+ def forward(
213
+ self,
214
+ hidden_states,
215
+ attention_mask=None,
216
+ head_mask=None,
217
+ encoder_hidden_states=None,
218
+ encoder_attention_mask=None,
219
+ output_attentions=False,
220
+ ):
221
+ mixed_query_layer = self.query(hidden_states)
222
+
223
+ # If this is instantiated as a cross-attention module, the keys
224
+ # and values come from an encoder; the attention mask needs to be
225
+ # such that the encoder's padding tokens are not attended to.
226
+ if encoder_hidden_states is not None:
227
+ mixed_key_layer = self.key(encoder_hidden_states)
228
+ mixed_value_layer = self.value(encoder_hidden_states)
229
+ attention_mask = encoder_attention_mask
230
+ else:
231
+ mixed_key_layer = self.key(hidden_states)
232
+ mixed_value_layer = self.value(hidden_states)
233
+
234
+ query_layer = self.transpose_for_scores(mixed_query_layer)
235
+ key_layer = self.transpose_for_scores(mixed_key_layer)
236
+ value_layer = self.transpose_for_scores(mixed_value_layer)
237
+
238
+ # Take the dot product between "query" and "key" to get the raw attention scores.
239
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
240
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
241
+ if attention_mask is not None:
242
+ # Apply the attention mask (precomputed for all layers in the BertModel forward() function)
243
+ attention_scores = attention_scores + attention_mask
244
+
245
+ # Normalize the attention scores to probabilities.
246
+ attention_probs = nn.Softmax(dim=-1)(attention_scores)
247
+
248
+ # This is actually dropping out entire tokens to attend to, which might
249
+ # seem a bit unusual, but is taken from the original Transformer paper.
250
+ attention_probs = self.dropout(attention_probs)
251
+
252
+ # Mask heads if we want to
253
+ if head_mask is not None:
254
+ attention_probs = attention_probs * head_mask
255
+
256
+ context_layer = torch.matmul(attention_probs, value_layer)
257
+
258
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
259
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
260
+ context_layer = context_layer.view(*new_context_layer_shape)
261
+
262
+ outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
263
+ return outputs
264
+
265
+
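A toy single-head version of the computation in `BertSelfAttention.forward` above (all sizes made up), showing the scaled dot-product, the additive mask convention, and the softmax-weighted sum over values:

import math
import torch
import torch.nn as nn

B, L, H = 2, 4, 8                                   # batch, sequence length, head size
q, k, v = torch.randn(B, L, H), torch.randn(B, L, H), torch.randn(B, L, H)
scores = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(H)   # (B, L, L)
mask = torch.zeros(B, 1, L)
mask[:, :, -1] = -10000.0                           # additive mask: large negative hides the last position
probs = nn.Softmax(dim=-1)(scores + mask)           # rows sum to 1, last column ~0
context = torch.matmul(probs, v)                    # (B, L, H)
print(context.shape)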
266
+ class BertSelfOutput(nn.Module):
267
+ def __init__(self, config):
268
+ super().__init__()
269
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
270
+ self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
271
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
272
+
273
+ def forward(self, hidden_states, input_tensor):
274
+ hidden_states = self.dense(hidden_states)
275
+ hidden_states = self.dropout(hidden_states)
276
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
277
+ return hidden_states
278
+
279
+
280
+ class BertAttention(nn.Module):
281
+ def __init__(self, config):
282
+ super().__init__()
283
+ self.self = BertSelfAttention(config)
284
+ self.output = BertSelfOutput(config)
285
+ self.pruned_heads = set()
286
+
287
+ def prune_heads(self, heads):
288
+ if len(heads) == 0:
289
+ return
290
+ heads, index = find_pruneable_heads_and_indices(
291
+ heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
292
+ )
293
+
294
+ # Prune linear layers
295
+ self.self.query = prune_linear_layer(self.self.query, index)
296
+ self.self.key = prune_linear_layer(self.self.key, index)
297
+ self.self.value = prune_linear_layer(self.self.value, index)
298
+ self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
299
+
300
+ # Update hyper params and store pruned heads
301
+ self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
302
+ self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
303
+ self.pruned_heads = self.pruned_heads.union(heads)
304
+
305
+ def forward(
306
+ self,
307
+ hidden_states,
308
+ attention_mask=None,
309
+ head_mask=None,
310
+ encoder_hidden_states=None,
311
+ encoder_attention_mask=None,
312
+ output_attentions=False,
313
+ ):
314
+ self_outputs = self.self(
315
+ hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, output_attentions,
316
+ )
317
+ attention_output = self.output(self_outputs[0], hidden_states)
318
+ outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
319
+ return outputs
320
+
321
+
322
+ class BertIntermediate(nn.Module):
323
+ def __init__(self, config):
324
+ super().__init__()
325
+ self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
326
+ if isinstance(config.hidden_act, str):
327
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
328
+ else:
329
+ self.intermediate_act_fn = config.hidden_act
330
+
331
+ def forward(self, hidden_states):
332
+ hidden_states = self.dense(hidden_states)
333
+ hidden_states = self.intermediate_act_fn(hidden_states)
334
+ return hidden_states
335
+
336
+
337
+ class BertOutput(nn.Module):
338
+ def __init__(self, config):
339
+ super().__init__()
340
+ self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
341
+ self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
342
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
343
+
344
+ def forward(self, hidden_states, input_tensor):
345
+ hidden_states = self.dense(hidden_states)
346
+ hidden_states = self.dropout(hidden_states)
347
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
348
+ return hidden_states
349
+
350
+
351
+ class BertLayer(nn.Module):
352
+ def __init__(self, config):
353
+ super().__init__()
354
+ self.attention = BertAttention(config)
355
+ self.is_decoder = config.is_decoder
356
+ if self.is_decoder:
357
+ self.crossattention = BertAttention(config)
358
+ self.intermediate = BertIntermediate(config)
359
+ self.output = BertOutput(config)
360
+
361
+ def forward(
362
+ self,
363
+ hidden_states,
364
+ attention_mask=None,
365
+ head_mask=None,
366
+ encoder_hidden_states=None,
367
+ encoder_attention_mask=None,
368
+ output_attentions=False,
369
+ ):
370
+ self_attention_outputs = self.attention(
371
+ hidden_states, attention_mask, head_mask, output_attentions=output_attentions,
372
+ )
373
+ attention_output = self_attention_outputs[0]
374
+ outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
375
+
376
+ if self.is_decoder and encoder_hidden_states is not None:
377
+ cross_attention_outputs = self.crossattention(
378
+ attention_output,
379
+ attention_mask,
380
+ head_mask,
381
+ encoder_hidden_states,
382
+ encoder_attention_mask,
383
+ output_attentions,
384
+ )
385
+ attention_output = cross_attention_outputs[0]
386
+ outputs = outputs + cross_attention_outputs[1:] # add cross attentions if we output attention weights
387
+
388
+ intermediate_output = self.intermediate(attention_output)
389
+ layer_output = self.output(intermediate_output, attention_output)
390
+ outputs = (layer_output,) + outputs
391
+ return outputs
392
+
393
+
394
+ class BertEncoder(nn.Module):
395
+ def __init__(self, config):
396
+ super().__init__()
397
+ self.config = config
398
+ self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
399
+
400
+ def forward(
401
+ self,
402
+ hidden_states,
403
+ attention_mask=None,
404
+ head_mask=None,
405
+ encoder_hidden_states=None,
406
+ encoder_attention_mask=None,
407
+ output_attentions=False,
408
+ output_hidden_states=False,
409
+ ):
410
+ all_hidden_states = ()
411
+ all_attentions = ()
412
+ for i, layer_module in enumerate(self.layer):
413
+ if output_hidden_states:
414
+ all_hidden_states = all_hidden_states + (hidden_states,)
415
+
416
+ if getattr(self.config, "gradient_checkpointing", False):
417
+
418
+ def create_custom_forward(module):
419
+ def custom_forward(*inputs):
420
+ return module(*inputs, output_attentions)
421
+
422
+ return custom_forward
423
+
424
+ layer_outputs = torch.utils.checkpoint.checkpoint(
425
+ create_custom_forward(layer_module),
426
+ hidden_states,
427
+ attention_mask,
428
+ head_mask[i],
429
+ encoder_hidden_states,
430
+ encoder_attention_mask,
431
+ )
432
+ else:
433
+ layer_outputs = layer_module(
434
+ hidden_states,
435
+ attention_mask,
436
+ head_mask[i],
437
+ encoder_hidden_states,
438
+ encoder_attention_mask,
439
+ output_attentions,
440
+ )
441
+ hidden_states = layer_outputs[0]
442
+
443
+ if output_attentions:
444
+ all_attentions = all_attentions + (layer_outputs[1],)
445
+
446
+ # Add last layer
447
+ if output_hidden_states:
448
+ all_hidden_states = all_hidden_states + (hidden_states,)
449
+
450
+ outputs = (hidden_states,)
451
+ if output_hidden_states:
452
+ outputs = outputs + (all_hidden_states,)
453
+ if output_attentions:
454
+ outputs = outputs + (all_attentions,)
455
+ return outputs # last-layer hidden state, (all hidden states), (all attentions)
456
+
457
+
458
+ class BertPooler(nn.Module):
459
+ def __init__(self, config):
460
+ super().__init__()
461
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
462
+ self.activation = nn.Tanh()
463
+
464
+ def forward(self, hidden_states):
465
+ # We "pool" the model by simply taking the hidden state corresponding
466
+ # to the first token.
467
+ first_token_tensor = hidden_states[:, 0]
468
+ pooled_output = self.dense(first_token_tensor)
469
+ pooled_output = self.activation(pooled_output)
470
+ return pooled_output
471
+
472
+
473
+ class BertPredictionHeadTransform(nn.Module):
474
+ def __init__(self, config):
475
+ super().__init__()
476
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
477
+ if isinstance(config.hidden_act, str):
478
+ self.transform_act_fn = ACT2FN[config.hidden_act]
479
+ else:
480
+ self.transform_act_fn = config.hidden_act
481
+ self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
482
+
483
+ def forward(self, hidden_states):
484
+ hidden_states = self.dense(hidden_states)
485
+ hidden_states = self.transform_act_fn(hidden_states)
486
+ hidden_states = self.LayerNorm(hidden_states)
487
+ return hidden_states
488
+
489
+
490
+ class BertLMPredictionHead(nn.Module):
491
+ def __init__(self, config):
492
+ super().__init__()
493
+ self.transform = BertPredictionHeadTransform(config)
494
+
495
+ # The output weights are the same as the input embeddings, but there is
496
+ # an output-only bias for each token.
497
+ self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
498
+
499
+ self.bias = nn.Parameter(torch.zeros(config.vocab_size))
500
+
501
+ # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
502
+ self.decoder.bias = self.bias
503
+
504
+ def forward(self, hidden_states):
505
+ hidden_states = self.transform(hidden_states)
506
+ hidden_states = self.decoder(hidden_states)
507
+ return hidden_states
508
+
509
+
510
+ class BertOnlyMLMHead(nn.Module):
511
+ def __init__(self, config):
512
+ super().__init__()
513
+ self.predictions = BertLMPredictionHead(config)
514
+
515
+ def forward(self, sequence_output):
516
+ prediction_scores = self.predictions(sequence_output)
517
+ return prediction_scores
518
+
519
+
520
+ class BertOnlyNSPHead(nn.Module):
521
+ def __init__(self, config):
522
+ super().__init__()
523
+ self.seq_relationship = nn.Linear(config.hidden_size, 2)
524
+
525
+ def forward(self, pooled_output):
526
+ seq_relationship_score = self.seq_relationship(pooled_output)
527
+ return seq_relationship_score
528
+
529
+
530
+ class BertPreTrainingHeads(nn.Module):
531
+ def __init__(self, config):
532
+ super().__init__()
533
+ self.predictions = BertLMPredictionHead(config)
534
+ self.seq_relationship = nn.Linear(config.hidden_size, 2)
535
+
536
+ def forward(self, sequence_output, pooled_output):
537
+ prediction_scores = self.predictions(sequence_output)
538
+ seq_relationship_score = self.seq_relationship(pooled_output)
539
+ return prediction_scores, seq_relationship_score
540
+
541
+
542
+ class BertPreTrainedModel(PreTrainedModel):
543
+ """ An abstract class to handle weights initialization and
544
+ a simple interface for downloading and loading pretrained models.
545
+ """
546
+
547
+ config_class = BertConfig
548
+ load_tf_weights = load_tf_weights_in_bert
549
+ base_model_prefix = "bert"
550
+
551
+ def _init_weights(self, module):
552
+ """ Initialize the weights """
553
+ if isinstance(module, (nn.Linear, nn.Embedding)):
554
+ # Slightly different from the TF version which uses truncated_normal for initialization
555
+ # cf https://github.com/pytorch/pytorch/pull/5617
556
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
557
+ elif isinstance(module, BertLayerNorm):
558
+ module.bias.data.zero_()
559
+ module.weight.data.fill_(1.0)
560
+ if isinstance(module, nn.Linear) and module.bias is not None:
561
+ module.bias.data.zero_()
562
+
563
+
564
+ BERT_START_DOCSTRING = r"""
565
+ This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class.
566
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general
567
+ usage and behavior.
568
+
569
+ Parameters:
570
+ config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model.
571
+ Initializing with a config file does not load the weights associated with the model, only the configuration.
572
+ Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
573
+ """
574
+
575
+ BERT_INPUTS_DOCSTRING = r"""
576
+ Args:
577
+ input_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`):
578
+ Indices of input sequence tokens in the vocabulary.
579
+
580
+ Indices can be obtained using :class:`transformers.BertTokenizer`.
581
+ See :func:`transformers.PreTrainedTokenizer.encode` and
582
+ :func:`transformers.PreTrainedTokenizer.__call__` for details.
583
+
584
+ `What are input IDs? <../glossary.html#input-ids>`__
585
+ attention_mask (:obj:`torch.FloatTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
586
+ Mask to avoid performing attention on padding token indices.
587
+ Mask values selected in ``[0, 1]``:
588
+ ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
589
+
590
+ `What are attention masks? <../glossary.html#attention-mask>`__
591
+ token_type_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
592
+ Segment token indices to indicate first and second portions of the inputs.
593
+ Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
594
+ corresponds to a `sentence B` token
595
+
596
+ `What are token type IDs? <../glossary.html#token-type-ids>`_
597
+ position_ids (:obj:`torch.LongTensor` of shape :obj:`{0}`, `optional`, defaults to :obj:`None`):
598
+ Indices of positions of each input sequence tokens in the position embeddings.
599
+ Selected in the range ``[0, config.max_position_embeddings - 1]``.
600
+
601
+ `What are position IDs? <../glossary.html#position-ids>`_
602
+ head_mask (:obj:`torch.FloatTensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`, defaults to :obj:`None`):
603
+ Mask to nullify selected heads of the self-attention modules.
604
+ Mask values selected in ``[0, 1]``:
605
+ :obj:`1` indicates the head is **not masked**, :obj:`0` indicates the head is **masked**.
606
+ inputs_embeds (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
607
+ Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
608
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
609
+ than the model's internal embedding lookup matrix.
610
+ encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`, defaults to :obj:`None`):
611
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
612
+ if the model is configured as a decoder.
613
+ encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
614
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask
615
+ is used in the cross-attention if the model is configured as a decoder.
616
+ Mask values selected in ``[0, 1]``:
617
+ ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
618
+ output_attentions (:obj:`bool`, `optional`, defaults to :obj:`None`):
619
+ If set to ``True``, the attentions tensors of all attention layers are returned. See ``attentions`` under returned tensors for more detail.
620
+ """
621
+
622
+
623
+ @add_start_docstrings(
624
+ "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
625
+ BERT_START_DOCSTRING,
626
+ )
627
+ class BertModel(BertPreTrainedModel):
628
+ """
629
+
630
+ The model can behave as an encoder (with only self-attention) as well
631
+ as a decoder, in which case a layer of cross-attention is added between
632
+ the self-attention layers, following the architecture described in `Attention is all you need`_ by Ashish Vaswani,
633
+ Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
634
+
635
+ To behave as a decoder the model needs to be initialized with the
636
+ :obj:`is_decoder` argument of the configuration set to :obj:`True`; an
637
+ :obj:`encoder_hidden_states` is expected as an input to the forward pass.
638
+
639
+ .. _`Attention is all you need`:
640
+ https://arxiv.org/abs/1706.03762
641
+
642
+ """
643
+
644
+ def __init__(self, config):
645
+ super().__init__(config)
646
+ self.config = config
647
+
648
+ self.embeddings = BertEmbeddings(config)
649
+ self.encoder = BertEncoder(config)
650
+ self.pooler = BertPooler(config)
651
+
652
+ self.init_weights()
653
+
654
+ def get_input_embeddings(self):
655
+ return self.embeddings.word_embeddings
656
+
657
+ def set_input_embeddings(self, value):
658
+ self.embeddings.word_embeddings = value
659
+
660
+ def _prune_heads(self, heads_to_prune):
661
+ """ Prunes heads of the model.
662
+ heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
663
+ See base class PreTrainedModel
664
+ """
665
+ for layer, heads in heads_to_prune.items():
666
+ self.encoder.layer[layer].attention.prune_heads(heads)
667
+
668
+ @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
669
+ @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
670
+ def forward(
671
+ self,
672
+ input_ids=None,
673
+ attention_mask=None,
674
+ token_type_ids=None,
675
+ position_ids=None,
676
+ head_mask=None,
677
+ inputs_embeds=None,
678
+ encoder_hidden_states=None,
679
+ encoder_attention_mask=None,
680
+ output_attentions=None,
681
+ output_hidden_states=None,
682
+ ):
683
+ r"""
684
+ Return:
685
+ :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
686
+ last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
687
+ Sequence of hidden-states at the output of the last layer of the model.
688
+ pooler_output (:obj:`torch.FloatTensor`: of shape :obj:`(batch_size, hidden_size)`):
689
+ Last layer hidden-state of the first token of the sequence (classification token)
690
+ further processed by a Linear layer and a Tanh activation function. The Linear
691
+ layer weights are trained from the next sentence prediction (classification)
692
+ objective during pre-training.
693
+
694
+ This output is usually *not* a good summary
695
+ of the semantic content of the input; you're often better off averaging or pooling
696
+ the sequence of hidden-states for the whole input sequence.
697
+ hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
698
+ Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
699
+ of shape :obj:`(batch_size, sequence_length, hidden_size)`.
700
+
701
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
702
+ attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
703
+ Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
704
+ :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
705
+
706
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
707
+ heads.
708
+ """
709
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
710
+ output_hidden_states = (
711
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
712
+ )
713
+
714
+ if input_ids is not None and inputs_embeds is not None:
715
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
716
+ elif input_ids is not None:
717
+ input_shape = input_ids.size()
718
+ elif inputs_embeds is not None:
719
+ input_shape = inputs_embeds.size()[:-1]
720
+ else:
721
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
722
+
723
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
724
+
725
+ if attention_mask is None:
726
+ attention_mask = torch.ones(input_shape, device=device)
727
+ if token_type_ids is None:
728
+ token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
729
+
730
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
731
+ # ourselves in which case we just need to make it broadcastable to all heads.
732
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device)
733
+
734
+ # If a 2D or 3D attention mask is provided for the cross-attention
735
+ # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
736
+ if self.config.is_decoder and encoder_hidden_states is not None:
737
+ encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
738
+ encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
739
+ if encoder_attention_mask is None:
740
+ encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
741
+ encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
742
+ else:
743
+ encoder_extended_attention_mask = None
744
+
745
+ # Prepare head mask if needed
746
+ # 1.0 in head_mask indicates we keep the head
747
+ # attention_probs has shape bsz x n_heads x N x N
748
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
749
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
750
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
751
+
752
+ embedding_output = self.embeddings(
753
+ input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
754
+ )
755
+ encoder_outputs = self.encoder(
756
+ embedding_output,
757
+ attention_mask=extended_attention_mask,
758
+ head_mask=head_mask,
759
+ encoder_hidden_states=encoder_hidden_states,
760
+ encoder_attention_mask=encoder_extended_attention_mask,
761
+ output_attentions=output_attentions,
762
+ output_hidden_states=output_hidden_states,
763
+ )
764
+ sequence_output = encoder_outputs[0]
765
+ pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
766
+
767
+ outputs = (sequence_output, pooled_output,) + encoder_outputs[
768
+ 1:
769
+ ] # add hidden_states and attentions if they are here
770
+ return outputs # sequence_output, pooled_output, (hidden_states), (attentions)
771
+
772
+
773
+ @add_start_docstrings(
774
+ """Bert Model with two heads on top as done during the pre-training: a `masked language modeling` head and
775
+ a `next sentence prediction (classification)` head. """,
776
+ BERT_START_DOCSTRING,
777
+ )
778
+ class BertForPreTraining(BertPreTrainedModel):
779
+ def __init__(self, config):
780
+ super().__init__(config)
781
+
782
+ self.bert = BertModel(config)
783
+ self.cls = BertPreTrainingHeads(config)
784
+
785
+ self.init_weights()
786
+
787
+ def get_output_embeddings(self):
788
+ return self.cls.predictions.decoder
789
+
790
+ @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
791
+ def forward(
792
+ self,
793
+ input_ids=None,
794
+ attention_mask=None,
795
+ token_type_ids=None,
796
+ position_ids=None,
797
+ head_mask=None,
798
+ inputs_embeds=None,
799
+ labels=None,
800
+ next_sentence_label=None,
801
+ output_attentions=None,
802
+ output_hidden_states=None,
803
+ **kwargs
804
+ ):
805
+ r"""
806
+ labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
807
+ Labels for computing the masked language modeling loss.
808
+ Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
809
+ Tokens with indices set to ``-100`` are ignored (masked); the loss is only computed for the tokens with labels
810
+ in ``[0, ..., config.vocab_size]``
811
+ next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
812
+ Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring)
813
+ Indices should be in ``[0, 1]``.
814
+ ``0`` indicates sequence B is a continuation of sequence A,
815
+ ``1`` indicates sequence B is a random sequence.
816
+ kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
817
+ Used to hide legacy arguments that have been deprecated.
818
+
819
+ Returns:
820
+ :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
821
+ loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
822
+ Total loss as the sum of the masked language modeling loss and the next sequence prediction (classification) loss.
823
+ prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
824
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
825
+ seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`):
826
+ Prediction scores of the next sequence prediction (classification) head (scores of True/False
827
+ continuation before SoftMax).
828
+ hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
829
+ Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
830
+ of shape :obj:`(batch_size, sequence_length, hidden_size)`.
831
+
832
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
833
+ attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
834
+ Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
835
+ :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
836
+
837
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
838
+ heads.
839
+
840
+
841
+ Examples::
842
+
843
+ >>> from transformers import BertTokenizer, BertForPreTraining
844
+ >>> import torch
845
+
846
+ >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
847
+ >>> model = BertForPreTraining.from_pretrained('bert-base-uncased')
848
+
849
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
850
+ >>> outputs = model(**inputs)
851
+
852
+ >>> prediction_scores, seq_relationship_scores = outputs[:2]
853
+
854
+ """
855
+ if "masked_lm_labels" in kwargs:
856
+ warnings.warn(
857
+ "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
858
+ DeprecationWarning,
859
+ )
860
+ labels = kwargs.pop("masked_lm_labels")
861
+ assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
862
+
863
+ outputs = self.bert(
864
+ input_ids,
865
+ attention_mask=attention_mask,
866
+ token_type_ids=token_type_ids,
867
+ position_ids=position_ids,
868
+ head_mask=head_mask,
869
+ inputs_embeds=inputs_embeds,
870
+ output_attentions=output_attentions,
871
+ output_hidden_states=output_hidden_states,
872
+ )
873
+
874
+ sequence_output, pooled_output = outputs[:2]
875
+ prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
876
+
877
+ outputs = (prediction_scores, seq_relationship_score,) + outputs[
878
+ 2:
879
+ ] # add hidden states and attention if they are here
880
+
881
+ if labels is not None and next_sentence_label is not None:
882
+ loss_fct = CrossEntropyLoss()
883
+ masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
884
+ next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
885
+ total_loss = masked_lm_loss + next_sentence_loss
886
+ outputs = (total_loss,) + outputs
887
+
888
+ return outputs # (loss), prediction_scores, seq_relationship_score, (hidden_states), (attentions)
889
+
890
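A minimal sketch of the joint objective computed above, using random stand-in tensors in place of real model outputs: the total pre-training loss is simply the sum of the masked-LM and next-sentence cross-entropies.

import torch
from torch.nn import CrossEntropyLoss

vocab_size = 10
prediction_scores = torch.randn(2, 6, vocab_size)        # (batch, seq_len, vocab)
mlm_labels = torch.randint(0, vocab_size, (2, 6))
seq_relationship_score = torch.randn(2, 2)               # (batch, 2)
next_sentence_label = torch.tensor([0, 1])

loss_fct = CrossEntropyLoss()
masked_lm_loss = loss_fct(prediction_scores.view(-1, vocab_size), mlm_labels.view(-1))
next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
total_loss = masked_lm_loss + next_sentence_loss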
+
891
+ @add_start_docstrings(
892
+ """Bert Model with a `language modeling` head on top for CLM fine-tuning. """, BERT_START_DOCSTRING
893
+ )
894
+ class BertLMHeadModel(BertPreTrainedModel):
895
+ def __init__(self, config):
896
+ super().__init__(config)
897
+ assert config.is_decoder, "If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True`."
898
+
899
+ self.bert = BertModel(config)
900
+ self.cls = BertOnlyMLMHead(config)
901
+
902
+ self.init_weights()
903
+
904
+ def get_output_embeddings(self):
905
+ return self.cls.predictions.decoder
906
+
907
+ @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
908
+ def forward(
909
+ self,
910
+ input_ids=None,
911
+ attention_mask=None,
912
+ token_type_ids=None,
913
+ position_ids=None,
914
+ head_mask=None,
915
+ inputs_embeds=None,
916
+ labels=None,
917
+ encoder_hidden_states=None,
918
+ encoder_attention_mask=None,
919
+ output_attentions=None,
920
+ output_hidden_states=None,
921
+ **kwargs
922
+ ):
923
+ r"""
924
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
925
+ Labels for computing the left-to-right language modeling loss (next word prediction).
926
+ Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
927
+ Tokens with indices set to ``-100`` are ignored (masked); the loss is only computed for the tokens with labels
928
+ in ``[0, ..., config.vocab_size]``
929
+ kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
930
+ Used to hide legacy arguments that have been deprecated.
931
+
932
+ Returns:
933
+ :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
934
+ ltr_lm_loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
935
+ Next token prediction loss.
936
+ prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
937
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
938
+ hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
939
+ Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
940
+ of shape :obj:`(batch_size, sequence_length, hidden_size)`.
941
+
942
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
943
+ attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
944
+ Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
945
+ :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
946
+
947
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
948
+ heads.
949
+
950
+ Example::
951
+
952
+ >>> from transformers import BertTokenizer, BertLMHeadModel, BertConfig
953
+ >>> import torch
954
+
955
+ >>> tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
956
+ >>> config = BertConfig.from_pretrained("bert-base-cased")
957
+ >>> config.is_decoder = True
958
+ >>> model = BertLMHeadModel.from_pretrained('bert-base-cased', config=config)
959
+
960
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
961
+ >>> outputs = model(**inputs)
962
+
963
+ >>> last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple
964
+ """
965
+
966
+ outputs = self.bert(
967
+ input_ids,
968
+ attention_mask=attention_mask,
969
+ token_type_ids=token_type_ids,
970
+ position_ids=position_ids,
971
+ head_mask=head_mask,
972
+ inputs_embeds=inputs_embeds,
973
+ encoder_hidden_states=encoder_hidden_states,
974
+ encoder_attention_mask=encoder_attention_mask,
975
+ output_attentions=output_attentions,
976
+ output_hidden_states=output_hidden_states,
977
+ )
978
+
979
+ sequence_output = outputs[0]
980
+ prediction_scores = self.cls(sequence_output)
981
+
982
+ outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here
983
+
984
+ if labels is not None:
985
+ # we are doing next-token prediction; shift prediction scores and input ids by one
986
+ prediction_scores = prediction_scores[:, :-1, :].contiguous()
987
+ labels = labels[:, 1:].contiguous()
988
+ loss_fct = CrossEntropyLoss()
989
+ ltr_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
990
+ outputs = (ltr_lm_loss,) + outputs
991
+
992
+ return outputs # (ltr_lm_loss), prediction_scores, (hidden_states), (attentions)
993
+
994
+ def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
995
+ input_shape = input_ids.shape
996
+
997
+ # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
998
+ if attention_mask is None:
999
+ attention_mask = input_ids.new_ones(input_shape)
1000
+
1001
+ return {"input_ids": input_ids, "attention_mask": attention_mask}
1002
+
1003
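A small sketch (with stand-in tensors) of the label shift performed in the forward above: the score at position i is matched against the token at position i + 1.

import torch
from torch.nn import CrossEntropyLoss

vocab_size = 10
prediction_scores = torch.randn(2, 5, vocab_size)        # (batch, seq_len, vocab)
labels = torch.randint(0, vocab_size, (2, 5))            # usually the input ids themselves

shifted_scores = prediction_scores[:, :-1, :].contiguous()
shifted_labels = labels[:, 1:].contiguous()
ltr_lm_loss = CrossEntropyLoss()(shifted_scores.view(-1, vocab_size), shifted_labels.view(-1))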
+
1004
+ @add_start_docstrings("""Bert Model with a `language modeling` head on top. """, BERT_START_DOCSTRING)
1005
+ class BertForMaskedLM(BertPreTrainedModel):
1006
+ def __init__(self, config):
1007
+ super().__init__(config)
1008
+ assert (
1009
+ not config.is_decoder
1010
+ ), "If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for bi-directional self-attention."
1011
+
1012
+ self.bert = BertModel(config)
1013
+ self.cls = BertOnlyMLMHead(config)
1014
+
1015
+ self.init_weights()
1016
+
1017
+ def get_output_embeddings(self):
1018
+ return self.cls.predictions.decoder
1019
+
1020
+ @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
1021
+ @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
1022
+ def forward(
1023
+ self,
1024
+ input_ids=None,
1025
+ attention_mask=None,
1026
+ token_type_ids=None,
1027
+ position_ids=None,
1028
+ head_mask=None,
1029
+ inputs_embeds=None,
1030
+ labels=None,
1031
+ encoder_hidden_states=None,
1032
+ encoder_attention_mask=None,
1033
+ output_attentions=None,
1034
+ output_hidden_states=None,
1035
+ **kwargs
1036
+ ):
1037
+ r"""
1038
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
1039
+ Labels for computing the masked language modeling loss.
1040
+ Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
1041
+ Tokens with indices set to ``-100`` are ignored (masked); the loss is only computed for the tokens with labels
1042
+ in ``[0, ..., config.vocab_size]``
1043
+ kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
1044
+ Used to hide legacy arguments that have been deprecated.
1045
+
1046
+ Returns:
1047
+ :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
1048
+ masked_lm_loss (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
1049
+ Masked language modeling loss.
1050
+ prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`)
1051
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
1052
+ hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
1053
+ Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
1054
+ of shape :obj:`(batch_size, sequence_length, hidden_size)`.
1055
+
1056
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
1057
+ attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
1058
+ Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
1059
+ :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
1060
+
1061
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
1062
+ heads.
1063
+ """
1064
+ if "masked_lm_labels" in kwargs:
1065
+ warnings.warn(
1066
+ "The `masked_lm_labels` argument is deprecated and will be removed in a future version, use `labels` instead.",
1067
+ DeprecationWarning,
1068
+ )
1069
+ labels = kwargs.pop("masked_lm_labels")
1070
+ assert "lm_labels" not in kwargs, "Use `BertWithLMHead` for autoregressive language modeling task."
1071
+ assert kwargs == {}, f"Unexpected keyword arguments: {list(kwargs.keys())}."
1072
+
1073
+ outputs = self.bert(
1074
+ input_ids,
1075
+ attention_mask=attention_mask,
1076
+ token_type_ids=token_type_ids,
1077
+ position_ids=position_ids,
1078
+ head_mask=head_mask,
1079
+ inputs_embeds=inputs_embeds,
1080
+ encoder_hidden_states=encoder_hidden_states,
1081
+ encoder_attention_mask=encoder_attention_mask,
1082
+ output_attentions=output_attentions,
1083
+ output_hidden_states=output_hidden_states,
1084
+ )
1085
+
1086
+ sequence_output = outputs[0]
1087
+ prediction_scores = self.cls(sequence_output)
1088
+
1089
+ outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here
1090
+
1091
+ if labels is not None:
1092
+ loss_fct = CrossEntropyLoss() # -100 index = padding token
1093
+ masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
1094
+ outputs = (masked_lm_loss,) + outputs
1095
+
1096
+ return outputs # (masked_lm_loss), prediction_scores, (hidden_states), (attentions)
1097
+
1098
+ def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
1099
+ input_shape = input_ids.shape
1100
+ effective_batch_size = input_shape[0]
1101
+
1102
+ # add a dummy token
1103
+ assert self.config.pad_token_id is not None, "The PAD token should be defined for generation"
1104
+ attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1)
1105
+ dummy_token = torch.full(
1106
+ (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device
1107
+ )
1108
+ input_ids = torch.cat([input_ids, dummy_token], dim=1)
1109
+
1110
+ return {"input_ids": input_ids, "attention_mask": attention_mask}
1111
+
1112
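A sketch of how `labels` for the masked-LM loss above are typically built: every non-masked position is set to -100 so only the masked tokens are scored. The token ids below are assumed illustrative values, not taken from the file.

import torch

mask_token_id = 103                                       # assumed [MASK] id
input_ids = torch.tensor([[101, 7592, mask_token_id, 2003, 102]])
labels = torch.full_like(input_ids, -100)                 # ignore everything by default
labels[input_ids == mask_token_id] = 2166                 # assumed gold id for the masked slot
# loss, prediction_scores = model(input_ids, labels=labels)[:2]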
+
1113
+ @add_start_docstrings(
1114
+ """Bert Model with a `next sentence prediction (classification)` head on top. """, BERT_START_DOCSTRING,
1115
+ )
1116
+ class BertForNextSentencePrediction(BertPreTrainedModel):
1117
+ def __init__(self, config):
1118
+ super().__init__(config)
1119
+
1120
+ self.bert = BertModel(config)
1121
+ self.cls = BertOnlyNSPHead(config)
1122
+
1123
+ self.init_weights()
1124
+
1125
+ @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
1126
+ def forward(
1127
+ self,
1128
+ input_ids=None,
1129
+ attention_mask=None,
1130
+ token_type_ids=None,
1131
+ position_ids=None,
1132
+ head_mask=None,
1133
+ inputs_embeds=None,
1134
+ next_sentence_label=None,
1135
+ output_attentions=None,
1136
+ output_hidden_states=None,
1137
+ ):
1138
+ r"""
1139
+ next_sentence_label (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
1140
+ Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see ``input_ids`` docstring)
1141
+ Indices should be in ``[0, 1]``.
1142
+ ``0`` indicates sequence B is a continuation of sequence A,
1143
+ ``1`` indicates sequence B is a random sequence.
1144
+
1145
+ Returns:
1146
+ :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
1147
+ loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`next_sentence_label` is provided):
1148
+ Next sequence prediction (classification) loss.
1149
+ seq_relationship_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`):
1150
+ Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation before SoftMax).
1151
+ hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
1152
+ Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
1153
+ of shape :obj:`(batch_size, sequence_length, hidden_size)`.
1154
+
1155
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
1156
+ attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
1157
+ Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
1158
+ :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
1159
+
1160
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
1161
+ heads.
1162
+
1163
+ Examples::
1164
+
1165
+ >>> from transformers import BertTokenizer, BertForNextSentencePrediction
1166
+ >>> import torch
1167
+
1168
+ >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
1169
+ >>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
1170
+
1171
+ >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
1172
+ >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
1173
+ >>> encoding = tokenizer(prompt, next_sentence, return_tensors='pt')
1174
+
1175
+ >>> loss, logits = model(**encoding, next_sentence_label=torch.LongTensor([1]))
1176
+ >>> assert logits[0, 0] < logits[0, 1] # next sentence was random
1177
+ """
1178
+
1179
+ outputs = self.bert(
1180
+ input_ids,
1181
+ attention_mask=attention_mask,
1182
+ token_type_ids=token_type_ids,
1183
+ position_ids=position_ids,
1184
+ head_mask=head_mask,
1185
+ inputs_embeds=inputs_embeds,
1186
+ output_attentions=output_attentions,
1187
+ output_hidden_states=output_hidden_states,
1188
+ )
1189
+
1190
+ pooled_output = outputs[1]
1191
+
1192
+ seq_relationship_score = self.cls(pooled_output)
1193
+
1194
+ outputs = (seq_relationship_score,) + outputs[2:] # add hidden states and attention if they are here
1195
+ if next_sentence_label is not None:
1196
+ loss_fct = CrossEntropyLoss()
1197
+ next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
1198
+ outputs = (next_sentence_loss,) + outputs
1199
+
1200
+ return outputs # (next_sentence_loss), seq_relationship_score, (hidden_states), (attentions)
1201
+
1202
+
1203
+ @add_start_docstrings(
1204
+ """Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of
1205
+ the pooled output) e.g. for GLUE tasks. """,
1206
+ BERT_START_DOCSTRING,
1207
+ )
1208
+ class BertForSequenceClassification(BertPreTrainedModel):
1209
+ def __init__(self, config):
1210
+ super().__init__(config)
1211
+ self.num_labels = config.num_labels
1212
+
1213
+ self.bert = BertModel(config)
1214
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
1215
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels)
1216
+
1217
+ self.init_weights()
1218
+
1219
+ @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
1220
+ @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
1221
+ def forward(
1222
+ self,
1223
+ input_ids=None,
1224
+ attention_mask=None,
1225
+ token_type_ids=None,
1226
+ position_ids=None,
1227
+ head_mask=None,
1228
+ inputs_embeds=None,
1229
+ labels=None,
1230
+ output_attentions=None,
1231
+ output_hidden_states=None,
1232
+ ):
1233
+ r"""
1234
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
1235
+ Labels for computing the sequence classification/regression loss.
1236
+ Indices should be in :obj:`[0, ..., config.num_labels - 1]`.
1237
+ If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
1238
+ If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1239
+
1240
+ Returns:
1241
+ :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
1242
+ loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`label` is provided):
1243
+ Classification (or regression if config.num_labels==1) loss.
1244
+ logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, config.num_labels)`):
1245
+ Classification (or regression if config.num_labels==1) scores (before SoftMax).
1246
+ hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
1247
+ Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
1248
+ of shape :obj:`(batch_size, sequence_length, hidden_size)`.
1249
+
1250
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
1251
+ attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
1252
+ Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
1253
+ :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
1254
+
1255
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
1256
+ heads.
1257
+ """
1258
+
1259
+ outputs = self.bert(
1260
+ input_ids,
1261
+ attention_mask=attention_mask,
1262
+ token_type_ids=token_type_ids,
1263
+ position_ids=position_ids,
1264
+ head_mask=head_mask,
1265
+ inputs_embeds=inputs_embeds,
1266
+ output_attentions=output_attentions,
1267
+ output_hidden_states=output_hidden_states,
1268
+ )
1269
+
1270
+ pooled_output = outputs[1]
1271
+
1272
+ pooled_output = self.dropout(pooled_output)
1273
+ logits = self.classifier(pooled_output)
1274
+
1275
+ outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
1276
+
1277
+ if labels is not None:
1278
+ if self.num_labels == 1:
1279
+ # We are doing regression
1280
+ loss_fct = MSELoss()
1281
+ loss = loss_fct(logits.view(-1), labels.view(-1))
1282
+ else:
1283
+ loss_fct = CrossEntropyLoss()
1284
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
1285
+ outputs = (loss,) + outputs
1286
+
1287
+ return outputs # (loss), logits, (hidden_states), (attentions)
1288
+
1289
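A sketch of the loss branching above with stand-in logits: `num_labels == 1` is treated as regression (MSE), anything else as classification (cross-entropy).

import torch
from torch.nn import CrossEntropyLoss, MSELoss

logits = torch.randn(4, 3)                                # (batch, num_labels) with num_labels = 3
labels = torch.tensor([0, 2, 1, 1])
cls_loss = CrossEntropyLoss()(logits.view(-1, 3), labels.view(-1))

reg_logits = torch.randn(4, 1)                            # num_labels == 1 -> regression
reg_labels = torch.randn(4)
reg_loss = MSELoss()(reg_logits.view(-1), reg_labels.view(-1))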
+
1290
+ @add_start_docstrings(
1291
+ """Bert Model with a multiple choice classification head on top (a linear layer on top of
1292
+ the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
1293
+ BERT_START_DOCSTRING,
1294
+ )
1295
+ class BertForMultipleChoice(BertPreTrainedModel):
1296
+ def __init__(self, config):
1297
+ super().__init__(config)
1298
+
1299
+ self.bert = BertModel(config)
1300
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
1301
+ self.classifier = nn.Linear(config.hidden_size, 1)
1302
+
1303
+ self.init_weights()
1304
+
1305
+ @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, num_choices, sequence_length)"))
1306
+ @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
1307
+ def forward(
1308
+ self,
1309
+ input_ids=None,
1310
+ attention_mask=None,
1311
+ token_type_ids=None,
1312
+ position_ids=None,
1313
+ head_mask=None,
1314
+ inputs_embeds=None,
1315
+ labels=None,
1316
+ output_attentions=None,
1317
+ output_hidden_states=None,
1318
+ ):
1319
+ r"""
1320
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
1321
+ Labels for computing the multiple choice classification loss.
1322
+ Indices should be in ``[0, ..., num_choices-1]`` where `num_choices` is the size of the second dimension
1323
+ of the input tensors. (see `input_ids` above)
1324
+
1325
+ Returns:
1326
+ :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
1327
+ loss (:obj:`torch.FloatTensor` of shape `(1,)`, `optional`, returned when :obj:`labels` is provided):
1328
+ Classification loss.
1329
+ classification_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, num_choices)`):
1330
+ `num_choices` is the second dimension of the input tensors. (see `input_ids` above).
1331
+
1332
+ Classification scores (before SoftMax).
1333
+ hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
1334
+ Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
1335
+ of shape :obj:`(batch_size, sequence_length, hidden_size)`.
1336
+
1337
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
1338
+ attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
1339
+ Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
1340
+ :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
1341
+
1342
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
1343
+ heads.
1344
+ """
1345
+ num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
1346
+
1347
+ input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
1348
+ attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
1349
+ token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
1350
+ position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
1351
+ inputs_embeds = (
1352
+ inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
1353
+ if inputs_embeds is not None
1354
+ else None
1355
+ )
1356
+
1357
+ outputs = self.bert(
1358
+ input_ids,
1359
+ attention_mask=attention_mask,
1360
+ token_type_ids=token_type_ids,
1361
+ position_ids=position_ids,
1362
+ head_mask=head_mask,
1363
+ inputs_embeds=inputs_embeds,
1364
+ output_attentions=output_attentions,
1365
+ output_hidden_states=output_hidden_states,
1366
+ )
1367
+
1368
+ pooled_output = outputs[1]
1369
+
1370
+ pooled_output = self.dropout(pooled_output)
1371
+ logits = self.classifier(pooled_output)
1372
+ reshaped_logits = logits.view(-1, num_choices)
1373
+
1374
+ outputs = (reshaped_logits,) + outputs[2:] # add hidden states and attention if they are here
1375
+
1376
+ if labels is not None:
1377
+ loss_fct = CrossEntropyLoss()
1378
+ loss = loss_fct(reshaped_logits, labels)
1379
+ outputs = (loss,) + outputs
1380
+
1381
+ return outputs # (loss), reshaped_logits, (hidden_states), (attentions)
1382
+
1383
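A sketch of the reshaping above with stand-in tensors: the choice dimension is folded into the batch for a single BERT pass, and the per-choice scores are folded back afterwards.

import torch

batch, num_choices, seq_len, hidden = 2, 4, 16, 8
input_ids = torch.randint(0, 100, (batch, num_choices, seq_len))
flat_input_ids = input_ids.view(-1, input_ids.size(-1))   # (batch * num_choices, seq_len)

pooled_output = torch.randn(batch * num_choices, hidden)   # stand-in for BERT's pooled output
classifier = torch.nn.Linear(hidden, 1)
logits = classifier(pooled_output)                          # (batch * num_choices, 1)
reshaped_logits = logits.view(-1, num_choices)              # (batch, num_choices)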
+
1384
+ @add_start_docstrings(
1385
+ """Bert Model with a token classification head on top (a linear layer on top of
1386
+ the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
1387
+ BERT_START_DOCSTRING,
1388
+ )
1389
+ class BertForTokenClassification(BertPreTrainedModel):
1390
+ def __init__(self, config):
1391
+ super().__init__(config)
1392
+ self.num_labels = config.num_labels
1393
+
1394
+ self.bert = BertModel(config)
1395
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
1396
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels)
1397
+
1398
+ self.init_weights()
1399
+
1400
+ @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
1401
+ @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
1402
+ def forward(
1403
+ self,
1404
+ input_ids=None,
1405
+ attention_mask=None,
1406
+ token_type_ids=None,
1407
+ position_ids=None,
1408
+ head_mask=None,
1409
+ inputs_embeds=None,
1410
+ labels=None,
1411
+ output_attentions=None,
1412
+ output_hidden_states=None,
1413
+ ):
1414
+ r"""
1415
+ labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
1416
+ Labels for computing the token classification loss.
1417
+ Indices should be in ``[0, ..., config.num_labels - 1]``.
1418
+
1419
+ Returns:
1420
+ :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
1421
+ loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when ``labels`` is provided) :
1422
+ Classification loss.
1423
+ scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.num_labels)`)
1424
+ Classification scores (before SoftMax).
1425
+ hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
1426
+ Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
1427
+ of shape :obj:`(batch_size, sequence_length, hidden_size)`.
1428
+
1429
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
1430
+ attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
1431
+ Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
1432
+ :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
1433
+
1434
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
1435
+ heads.
1436
+ """
1437
+
1438
+ outputs = self.bert(
1439
+ input_ids,
1440
+ attention_mask=attention_mask,
1441
+ token_type_ids=token_type_ids,
1442
+ position_ids=position_ids,
1443
+ head_mask=head_mask,
1444
+ inputs_embeds=inputs_embeds,
1445
+ output_attentions=output_attentions,
1446
+ output_hidden_states=output_hidden_states,
1447
+ )
1448
+
1449
+ sequence_output = outputs[0]
1450
+
1451
+ sequence_output = self.dropout(sequence_output)
1452
+ logits = self.classifier(sequence_output)
1453
+
1454
+ outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here
1455
+ if labels is not None:
1456
+ loss_fct = CrossEntropyLoss()
1457
+ # Only keep active parts of the loss
1458
+ if attention_mask is not None:
1459
+ active_loss = attention_mask.view(-1) == 1
1460
+ active_logits = logits.view(-1, self.num_labels)
1461
+ active_labels = torch.where(
1462
+ active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
1463
+ )
1464
+ loss = loss_fct(active_logits, active_labels)
1465
+ else:
1466
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
1467
+ outputs = (loss,) + outputs
1468
+
1469
+ return outputs # (loss), scores, (hidden_states), (attentions)
1470
+
1471
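A sketch of the "active loss" masking above with stand-in tensors: padded positions are mapped to the ignore index (-100) so they contribute nothing to the token-classification loss.

import torch
from torch.nn import CrossEntropyLoss

num_labels = 3
logits = torch.randn(2, 4, num_labels)
labels = torch.randint(0, num_labels, (2, 4))
attention_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]])

loss_fct = CrossEntropyLoss()
active = attention_mask.view(-1) == 1
active_labels = torch.where(active, labels.view(-1), torch.tensor(loss_fct.ignore_index))
loss = loss_fct(logits.view(-1, num_labels), active_labels)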
+
1472
+ @add_start_docstrings(
1473
+ """Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
1474
+ layers on top of the hidden-states output to compute `span start logits` and `span end logits`). """,
1475
+ BERT_START_DOCSTRING,
1476
+ )
1477
+ class BertForQuestionAnswering(BertPreTrainedModel):
1478
+ def __init__(self, config):
1479
+ super().__init__(config)
1480
+ self.num_labels = config.num_labels
1481
+
1482
+ self.bert = BertModel(config)
1483
+ self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
1484
+
1485
+ self.init_weights()
1486
+
1487
+ @add_start_docstrings_to_callable(BERT_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
1488
+ @add_code_sample_docstrings(tokenizer_class=_TOKENIZER_FOR_DOC, checkpoint="bert-base-uncased")
1489
+ def forward(
1490
+ self,
1491
+ input_ids=None,
1492
+ attention_mask=None,
1493
+ token_type_ids=None,
1494
+ position_ids=None,
1495
+ head_mask=None,
1496
+ inputs_embeds=None,
1497
+ start_positions=None,
1498
+ end_positions=None,
1499
+ output_attentions=None,
1500
+ output_hidden_states=None,
1501
+ ):
1502
+ r"""
1503
+ start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
1504
+ Labels for position (index) of the start of the labelled span for computing the token classification loss.
1505
+ Positions are clamped to the length of the sequence (`sequence_length`).
1506
+ Positions outside of the sequence are not taken into account for computing the loss.
1507
+ end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`, defaults to :obj:`None`):
1508
+ Labels for position (index) of the end of the labelled span for computing the token classification loss.
1509
+ Positions are clamped to the length of the sequence (`sequence_length`).
1510
+ Positions outside of the sequence are not taken into account for computing the loss.
1511
+
1512
+ Returns:
1513
+ :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.BertConfig`) and inputs:
1514
+ loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
1515
+ Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
1516
+ start_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
1517
+ Span-start scores (before SoftMax).
1518
+ end_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length,)`):
1519
+ Span-end scores (before SoftMax).
1520
+ hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
1521
+ Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
1522
+ of shape :obj:`(batch_size, sequence_length, hidden_size)`.
1523
+
1524
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
1525
+ attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
1526
+ Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
1527
+ :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
1528
+
1529
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
1530
+ heads.
1531
+ """
1532
+
1533
+ outputs = self.bert(
1534
+ input_ids,
1535
+ attention_mask=attention_mask,
1536
+ token_type_ids=token_type_ids,
1537
+ position_ids=position_ids,
1538
+ head_mask=head_mask,
1539
+ inputs_embeds=inputs_embeds,
1540
+ output_attentions=output_attentions,
1541
+ output_hidden_states=output_hidden_states,
1542
+ )
1543
+
1544
+ sequence_output = outputs[0]
1545
+
1546
+ logits = self.qa_outputs(sequence_output)
1547
+ start_logits, end_logits = logits.split(1, dim=-1)
1548
+ start_logits = start_logits.squeeze(-1)
1549
+ end_logits = end_logits.squeeze(-1)
1550
+
1551
+ outputs = (start_logits, end_logits,) + outputs[2:]
1552
+ if start_positions is not None and end_positions is not None:
1553
+ # On multi-GPU, the start/end positions may carry an extra dimension; squeeze it away
1554
+ if len(start_positions.size()) > 1:
1555
+ start_positions = start_positions.squeeze(-1)
1556
+ if len(end_positions.size()) > 1:
1557
+ end_positions = end_positions.squeeze(-1)
1558
+ # sometimes the start/end positions are outside our model inputs, we ignore these terms
1559
+ ignored_index = start_logits.size(1)
1560
+ start_positions.clamp_(0, ignored_index)
1561
+ end_positions.clamp_(0, ignored_index)
1562
+
1563
+ loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
1564
+ start_loss = loss_fct(start_logits, start_positions)
1565
+ end_loss = loss_fct(end_logits, end_positions)
1566
+ total_loss = (start_loss + end_loss) / 2
1567
+ outputs = (total_loss,) + outputs
1568
+
1569
+ return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions)
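A sketch of turning the start/end logits above into a predicted answer span (greedy argmax per side; real decoders usually also enforce start <= end and cap the span length).

import torch

start_logits = torch.randn(1, 20)                          # (batch, seq_len), stand-in values
end_logits = torch.randn(1, 20)
start_index = int(start_logits.argmax(dim=-1))
end_index = int(end_logits.argmax(dim=-1))
# answer_token_ids = input_ids[0, start_index : end_index + 1]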
CGFormer/bert/modeling_utils.py ADDED
@@ -0,0 +1,1268 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team.
3
+ # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ import inspect
18
+ import logging
19
+ import os
20
+ from typing import Callable, Dict, List, Optional, Tuple
21
+
22
+ import torch
23
+ from torch import Tensor, device, dtype, nn
24
+ from torch.nn import CrossEntropyLoss
25
+ from torch.nn import functional as F
26
+
27
+ from .activations import get_activation
28
+ from .configuration_utils import PretrainedConfig
29
+ from .file_utils import (
30
+ DUMMY_INPUTS,
31
+ TF2_WEIGHTS_NAME,
32
+ TF_WEIGHTS_NAME,
33
+ WEIGHTS_NAME,
34
+ cached_path,
35
+ hf_bucket_url,
36
+ is_remote_url,
37
+ )
38
+ from .generation_utils import GenerationMixin
39
+
40
+
41
+ logger = logging.getLogger(__name__)
42
+
43
+
44
+ try:
45
+ from torch.nn import Identity
46
+ except ImportError:
47
+ # Older PyTorch compatibility
48
+ class Identity(nn.Module):
49
+ r"""A placeholder identity operator that is argument-insensitive.
50
+ """
51
+
52
+ def __init__(self, *args, **kwargs):
53
+ super().__init__()
54
+
55
+ def forward(self, input):
56
+ return input
57
+
58
+
59
+ def find_pruneable_heads_and_indices(
60
+ heads: List, n_heads: int, head_size: int, already_pruned_heads: set
61
+ ) -> Tuple[set, "torch.LongTensor"]:
62
+ mask = torch.ones(n_heads, head_size)
63
+ heads = set(heads) - already_pruned_heads # Convert to set and remove already pruned heads
64
+ for head in heads:
65
+ # Compute how many pruned heads are before the head and move the index accordingly
66
+ head = head - sum(1 if h < head else 0 for h in already_pruned_heads)
67
+ mask[head] = 0
68
+ mask = mask.view(-1).contiguous().eq(1)
69
+ index: torch.LongTensor = torch.arange(len(mask))[mask].long()
70
+ return heads, index
71
+
72
+
73
+ class ModuleUtilsMixin:
74
+ """
75
+ A few utilities for torch.nn.Modules, to be used as a mixin.
76
+ """
77
+
78
+ def num_parameters(self, only_trainable: bool = False) -> int:
79
+ """
80
+ Get number of (optionally, trainable) parameters in the module.
81
+ """
82
+ params = filter(lambda x: x.requires_grad, self.parameters()) if only_trainable else self.parameters()
83
+ return sum(p.numel() for p in params)
84
+
85
+ @staticmethod
86
+ def _hook_rss_memory_pre_forward(module, *args, **kwargs):
87
+ try:
88
+ import psutil
89
+ except ImportError:
90
+ raise ImportError("You need to install psutil (pip install psutil) to use memory tracing.")
91
+
92
+ process = psutil.Process(os.getpid())
93
+ mem = process.memory_info()
94
+ module.mem_rss_pre_forward = mem.rss
95
+ return None
96
+
97
+ @staticmethod
98
+ def _hook_rss_memory_post_forward(module, *args, **kwargs):
99
+ try:
100
+ import psutil
101
+ except ImportError:
102
+ raise ImportError("You need to install psutil (pip install psutil) to use memory tracing.")
103
+
104
+ process = psutil.Process(os.getpid())
105
+ mem = process.memory_info()
106
+ module.mem_rss_post_forward = mem.rss
107
+ mem_rss_diff = module.mem_rss_post_forward - module.mem_rss_pre_forward
108
+ module.mem_rss_diff = mem_rss_diff + (module.mem_rss_diff if hasattr(module, "mem_rss_diff") else 0)
109
+ return None
110
+
111
+ def add_memory_hooks(self):
112
+ """ Add a memory hook before and after each sub-module forward pass to record increase in memory consumption.
113
+ Increase in memory consumption is stored in a `mem_rss_diff` attribute for each module and can be reset to zero with `model.reset_memory_hooks_state()`
114
+ """
115
+ for module in self.modules():
116
+ module.register_forward_pre_hook(self._hook_rss_memory_pre_forward)
117
+ module.register_forward_hook(self._hook_rss_memory_post_forward)
118
+ self.reset_memory_hooks_state()
119
+
120
+ def reset_memory_hooks_state(self):
121
+ for module in self.modules():
122
+ module.mem_rss_diff = 0
123
+ module.mem_rss_post_forward = 0
124
+ module.mem_rss_pre_forward = 0
125
+
126
+ @property
127
+ def device(self) -> device:
128
+ """
129
+ Get torch.device from module, assuming that the whole module has one device.
130
+ """
131
+ try:
132
+ return next(self.parameters()).device
133
+ except StopIteration:
134
+ # For nn.DataParallel compatibility in PyTorch 1.5
135
+
136
+ def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]:
137
+ tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)]
138
+ return tuples
139
+
140
+ gen = self._named_members(get_members_fn=find_tensor_attributes)
141
+ first_tuple = next(gen)
142
+ return first_tuple[1].device
143
+
144
+ @property
145
+ def dtype(self) -> dtype:
146
+ """
147
+ Get torch.dtype from module, assuming that the whole module has one dtype.
148
+ """
149
+ try:
150
+ return next(self.parameters()).dtype
151
+ except StopIteration:
152
+ # For nn.DataParallel compatibility in PyTorch 1.5
153
+
154
+ def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]:
155
+ tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)]
156
+ return tuples
157
+
158
+ gen = self._named_members(get_members_fn=find_tensor_attributes)
159
+ first_tuple = next(gen)
160
+ return first_tuple[1].dtype
161
+
162
+ def invert_attention_mask(self, encoder_attention_mask: Tensor) -> Tensor:
163
+ """type: torch.Tensor -> torch.Tensor"""
164
+ if encoder_attention_mask.dim() == 3:
165
+ encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
166
+ if encoder_attention_mask.dim() == 2:
167
+ encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
168
+ # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition
169
+ # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow
170
+ # /transformer/transformer_layers.py#L270
171
+ # encoder_extended_attention_mask = (encoder_extended_attention_mask ==
172
+ # encoder_extended_attention_mask.transpose(-1, -2))
173
+ encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility
174
+
175
+ if self.dtype == torch.float16:
176
+ encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e4
177
+ elif self.dtype == torch.float32:
178
+ encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e9
179
+ else:
180
+ raise ValueError(
181
+ "{} not recognized. `dtype` should be set to either `torch.float32` or `torch.float16`".format(
182
+ self.dtype
183
+ )
184
+ )
185
+
186
+ return encoder_extended_attention_mask
187
+
188
+ def get_extended_attention_mask(self, attention_mask: Tensor, input_shape: Tuple, device: device) -> Tensor:
189
+ """Makes broadcastable attention mask and causal mask so that future and maked tokens are ignored.
190
+
191
+ Arguments:
192
+ attention_mask: torch.Tensor with 1 indicating tokens to ATTEND to
193
+ input_shape: tuple, shape of input_ids
194
+ device: torch.Device, usually self.device
195
+
196
+ Returns:
197
+ torch.Tensor with dtype of attention_mask.dtype
198
+ """
199
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
200
+ # ourselves in which case we just need to make it broadcastable to all heads.
201
+ if attention_mask.dim() == 3:
202
+ extended_attention_mask = attention_mask[:, None, :, :]
203
+ elif attention_mask.dim() == 2:
204
+ # Provided a padding mask of dimensions [batch_size, seq_length]
205
+ # - if the model is a decoder, apply a causal mask in addition to the padding mask
206
+ # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
207
+ if self.config.is_decoder:
208
+ batch_size, seq_length = input_shape
209
+ seq_ids = torch.arange(seq_length, device=device)
210
+ causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
211
+ # causal and attention masks must have same type with pytorch version < 1.3
212
+ causal_mask = causal_mask.to(attention_mask.dtype)
213
+ extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
214
+ else:
215
+ extended_attention_mask = attention_mask[:, None, None, :]
216
+ else:
217
+ raise ValueError(
218
+ "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format(
219
+ input_shape, attention_mask.shape
220
+ )
221
+ )
222
+
223
+ # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
224
+ # masked positions, this operation will create a tensor which is 0.0 for
225
+ # positions we want to attend and -10000.0 for masked positions.
226
+ # Since we are adding it to the raw scores before the softmax, this is
227
+ # effectively the same as removing these entirely.
228
+ extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility
229
+ extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
230
+ return extended_attention_mask
231
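A sketch of the additive mask this method produces: keep-positions become 0.0 and pad-positions become -10000.0, so the result can simply be added to the raw attention scores before the softmax.

import torch

attention_mask = torch.tensor([[1, 1, 1, 0, 0]])           # (batch, seq_len)
extended = attention_mask[:, None, None, :].to(torch.float32)   # (batch, 1, 1, seq_len)
extended = (1.0 - extended) * -10000.0
# broadcasts against attention scores of shape (batch, num_heads, seq_len, seq_len)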
+
232
+ def get_head_mask(self, head_mask: Tensor, num_hidden_layers: int, is_attention_chunked: bool = False) -> Tensor:
233
+ """
234
+ # Prepare head mask if needed
235
+ # 1.0 in head_mask indicates we keep the head
236
+ attention_probs has shape bsz x n_heads x N x N
237
+ Arguments:
238
+ head_mask: torch.Tensor or None: has shape [num_heads] or [num_hidden_layers x num_heads]
239
+ num_hidden_layers: int
240
+ Returns:
241
+ Tensor of shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
242
+ or list with [None] for each layer
243
+ """
244
+ if head_mask is not None:
245
+ head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers)
246
+ if is_attention_chunked is True:
247
+ head_mask = head_mask.unsqueeze(-1)
248
+ else:
249
+ head_mask = [None] * num_hidden_layers
250
+
251
+ return head_mask
252
+
253
+ def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers):
254
+ """-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]"""
255
+ if head_mask.dim() == 1:
256
+ head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
257
+ head_mask = head_mask.expand(num_hidden_layers, -1, -1, -1, -1)
258
+ elif head_mask.dim() == 2:
259
+ head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer
260
+ assert head_mask.dim() == 5, f"head_mask.dim != 5, instead {head_mask.dim()}"
261
+ head_mask = head_mask.to(dtype=self.dtype) # switch to float if needed + fp16 compatibility
262
+ return head_mask
263
+
264
+
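# Editor's note: an illustrative usage sketch, not part of the original file. `model` is assumed
# to be an instance of a concrete PreTrainedModel subclass with 12 layers and 12 attention heads.
#
#     head_mask = torch.ones(12)                       # shape [num_heads]: keep every head
#     mask_5d = model.get_head_mask(head_mask, num_hidden_layers=12)
#     # mask_5d.shape == (12, 1, 12, 1, 1): one slice per layer, broadcastable over the
#     # batch and both sequence dimensions of the attention probabilities.
#     model.get_head_mask(None, num_hidden_layers=12)  # -> [None] * 12 (no masking)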
265
+ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin):
266
+ r""" Base class for all models.
267
+
268
+ :class:`~transformers.PreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models
269
+ as well as a few methods common to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads.
270
+
271
+ Class attributes (overridden by derived classes):
272
+ - ``config_class``: a class derived from :class:`~transformers.PretrainedConfig` to use as configuration class for this model architecture.
273
+ - ``load_tf_weights``: a python ``method`` for loading a TensorFlow checkpoint in a PyTorch model, taking as arguments:
274
+
275
+ - ``model``: an instance of the relevant subclass of :class:`~transformers.PreTrainedModel`,
276
+ - ``config``: an instance of the relevant subclass of :class:`~transformers.PretrainedConfig`,
277
+ - ``path``: a path (string) to the TensorFlow checkpoint.
278
+
279
+ - ``base_model_prefix``: a string indicating the attribute associated to the base model in derived classes of the same architecture adding modules on top of the base model.
280
+ """
281
+ config_class = None
282
+ base_model_prefix = ""
283
+
284
+ @property
285
+ def dummy_inputs(self):
286
+ """ Dummy inputs to do a forward pass in the network.
287
+
288
+ Returns:
289
+ torch.Tensor with dummy inputs
290
+ """
291
+ return {"input_ids": torch.tensor(DUMMY_INPUTS)}
292
+
293
+ def __init__(self, config, *inputs, **kwargs):
294
+ super().__init__()
295
+ if not isinstance(config, PretrainedConfig):
296
+ raise ValueError(
297
+ "Parameter config in `{}(config)` should be an instance of class `PretrainedConfig`. "
298
+ "To create a model from a pretrained model use "
299
+ "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
300
+ self.__class__.__name__, self.__class__.__name__
301
+ )
302
+ )
303
+ # Save config in model
304
+ self.config = config
305
+
306
+ @property
307
+ def base_model(self):
308
+ return getattr(self, self.base_model_prefix, self)
309
+
310
+ def get_input_embeddings(self):
311
+ """
312
+ Returns the model's input embeddings.
313
+
314
+ Returns:
315
+ :obj:`nn.Module`:
316
+ A torch module mapping vocabulary to hidden states.
317
+ """
318
+ base_model = getattr(self, self.base_model_prefix, self)
319
+ if base_model is not self:
320
+ return base_model.get_input_embeddings()
321
+ else:
322
+ raise NotImplementedError
323
+
324
+ def set_input_embeddings(self, value: nn.Module):
325
+ """
326
+ Set model's input embeddings
327
+
328
+ Args:
329
+ value (:obj:`nn.Module`):
330
+ A module mapping vocabulary to hidden states.
331
+ """
332
+ base_model = getattr(self, self.base_model_prefix, self)
333
+ if base_model is not self:
334
+ base_model.set_input_embeddings(value)
335
+ else:
336
+ raise NotImplementedError
337
+
338
+ def get_output_embeddings(self):
339
+ """
340
+ Returns the model's output embeddings.
341
+
342
+ Returns:
343
+ :obj:`nn.Module`:
344
+ A torch module mapping hidden states to vocabulary.
345
+ """
346
+ return None # Overwrite for models with output embeddings
347
+
348
+ def tie_weights(self):
349
+ """
350
+ Tie the weights between the input embeddings and the output embeddings.
351
+ If the `torchscript` flag is set in the configuration, TorchScript can't handle parameter sharing so we clone
352
+ the weights instead.
353
+ """
354
+ output_embeddings = self.get_output_embeddings()
355
+ if output_embeddings is not None:
356
+ self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings())
357
+
358
+ def _tie_or_clone_weights(self, output_embeddings, input_embeddings):
359
+ """ Tie or clone module weights depending on whether we are using TorchScript or not
360
+ """
361
+ if self.config.torchscript:
362
+ output_embeddings.weight = nn.Parameter(input_embeddings.weight.clone())
363
+ else:
364
+ output_embeddings.weight = input_embeddings.weight
365
+
366
+ if getattr(output_embeddings, "bias", None) is not None:
367
+ output_embeddings.bias.data = torch.nn.functional.pad(
368
+ output_embeddings.bias.data,
369
+ (0, output_embeddings.weight.shape[0] - output_embeddings.bias.shape[0],),
370
+ "constant",
371
+ 0,
372
+ )
373
+ if hasattr(output_embeddings, "out_features") and hasattr(input_embeddings, "num_embeddings"):
374
+ output_embeddings.out_features = input_embeddings.num_embeddings
375
+
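# Editor's note: an illustrative sketch, not part of the original file. It assumes a subclass
# that overrides get_output_embeddings() to return its LM head (the base class returns None,
# in which case tie_weights() is a no-op) and that config.torchscript is False.
#
#     model.tie_weights()
#     assert model.get_output_embeddings().weight is model.get_input_embeddings().weight
#     # With config.torchscript == True the weight is cloned instead of shared.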
376
+ def resize_token_embeddings(self, new_num_tokens: Optional[int] = None):
377
+ """ Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size.
378
+ Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.
379
+
380
+ Arguments:
381
+
382
+ new_num_tokens: (`optional`) int:
383
+ New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end. Reducing the size will remove vectors from the end.
384
+ If not provided or None: does nothing and just returns a pointer to the input tokens ``torch.nn.Embeddings`` Module of the model.
385
+
386
+ Return: ``torch.nn.Embeddings``
387
+ Pointer to the input tokens Embeddings Module of the model
388
+ """
389
+ base_model = getattr(self, self.base_model_prefix, self) # get the base model if needed
390
+ model_embeds = base_model._resize_token_embeddings(new_num_tokens)
391
+ if new_num_tokens is None:
392
+ return model_embeds
393
+
394
+ # Update base model and current model config
395
+ self.config.vocab_size = new_num_tokens
396
+ base_model.vocab_size = new_num_tokens
397
+
398
+ # Tie weights again if needed
399
+ self.tie_weights()
400
+
401
+ return model_embeds
402
+
403
+ def _resize_token_embeddings(self, new_num_tokens):
404
+ old_embeddings = self.get_input_embeddings()
405
+ new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens)
406
+ self.set_input_embeddings(new_embeddings)
407
+ return self.get_input_embeddings()
408
+
409
+ def _get_resized_embeddings(
410
+ self, old_embeddings: torch.nn.Embedding, new_num_tokens: Optional[int] = None
411
+ ) -> torch.nn.Embedding:
412
+ """ Build a resized Embedding Module from a provided token Embedding Module.
413
+ Increasing the size will add newly initialized vectors at the end
414
+ Reducing the size will remove vectors from the end
415
+
416
+ Args:
417
+ old_embeddings: ``torch.nn.Embedding``
418
+ Old embeddings to be resized.
419
+ new_num_tokens: (`optional`) int
420
+ New number of tokens in the embedding matrix.
421
+ Increasing the size will add newly initialized vectors at the end
422
+ Reducing the size will remove vectors from the end
423
+ If not provided or None: return the provided token Embedding Module.
424
+ Return: ``torch.nn.Embedding``
425
+ Pointer to the resized Embedding Module or the old Embedding Module if new_num_tokens is None
426
+ """
427
+ if new_num_tokens is None:
428
+ return old_embeddings
429
+
430
+ old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
431
+ if old_num_tokens == new_num_tokens:
432
+ return old_embeddings
433
+
434
+ # Build new embeddings
435
+ new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim)
436
+ new_embeddings.to(old_embeddings.weight.device)
437
+
438
+ # initialize all new embeddings (in particular added tokens)
439
+ self._init_weights(new_embeddings)
440
+
441
+ # Copy token embeddings from the previous weights
442
+ num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
443
+ new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :]
444
+
445
+ return new_embeddings
446
+
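# Editor's note: an illustrative sketch, not part of the original file, e.g. after adding new
# tokens to the tokenizer. `model` is assumed to be a concrete PreTrainedModel subclass instance.
#
#     old_size = model.config.vocab_size
#     new_embeddings = model.resize_token_embeddings(old_size + 2)
#     # new_embeddings.weight.shape == (old_size + 2, hidden_size); the first old_size rows are
#     # copied from the previous matrix, the two new rows are freshly initialized,
#     # config.vocab_size is updated and tie_weights() is called again.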
447
+ def init_weights(self):
448
+ """ Initialize weights, prune heads if needed and tie weights. """
449
+ # Initialize weights
450
+ self.apply(self._init_weights)
451
+
452
+ # Prune heads if needed
453
+ if self.config.pruned_heads:
454
+ self.prune_heads(self.config.pruned_heads)
455
+
456
+ # Tie weights if needed
457
+ self.tie_weights()
458
+
459
+ def prune_heads(self, heads_to_prune: Dict):
460
+ """ Prunes heads of the base model.
461
+
462
+ Arguments:
463
+
464
+ heads_to_prune: dict with keys being selected layer indices (`int`) and associated values being the list of heads to prune in said layer (list of `int`).
465
+ E.g. {1: [0, 2], 2: [2, 3]} will prune heads 0 and 2 on layer 1 and heads 2 and 3 on layer 2.
466
+ """
467
+ # save new sets of pruned heads as union of previously stored pruned heads and newly pruned heads
468
+ for layer, heads in heads_to_prune.items():
469
+ union_heads = set(self.config.pruned_heads.get(layer, [])) | set(heads)
470
+ self.config.pruned_heads[layer] = list(union_heads) # Unfortunately we have to store it as list for JSON
471
+
472
+ self.base_model._prune_heads(heads_to_prune)
473
+
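# Editor's note: an illustrative sketch, not part of the original file.
#
#     model.prune_heads({1: [0, 2], 2: [2, 3]})  # drop heads 0 and 2 of layer 1, heads 2 and 3 of layer 2
#     # config.pruned_heads stores the union of everything pruned so far, so repeated calls
#     # accumulate rather than overwrite.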
474
+ def save_pretrained(self, save_directory):
475
+ """ Save a model and its configuration file to a directory, so that it
476
+ can be re-loaded using the :func:`~transformers.PreTrainedModel.from_pretrained` class method.
477
+
478
+ Arguments:
479
+ save_directory: directory to which to save.
480
+ """
481
+ if os.path.isfile(save_directory):
482
+ logger.error("Provided path ({}) should be a directory, not a file".format(save_directory))
483
+ return
484
+ os.makedirs(save_directory, exist_ok=True)
485
+
486
+ # Only save the model itself if we are using distributed training
487
+ model_to_save = self.module if hasattr(self, "module") else self
488
+
489
+ # Attach architecture to the config
490
+ model_to_save.config.architectures = [model_to_save.__class__.__name__]
491
+
492
+ # If we save using the predefined names, we can load using `from_pretrained`
493
+ output_model_file = os.path.join(save_directory, WEIGHTS_NAME)
494
+
495
+ if getattr(self.config, "xla_device", False):
496
+ import torch_xla.core.xla_model as xm
497
+
498
+ if xm.is_master_ordinal():
499
+ # Save configuration file
500
+ model_to_save.config.save_pretrained(save_directory)
501
+ # xm.save takes care of saving only from master
502
+ xm.save(model_to_save.state_dict(), output_model_file)
503
+ else:
504
+ model_to_save.config.save_pretrained(save_directory)
505
+ torch.save(model_to_save.state_dict(), output_model_file)
506
+
507
+ logger.info("Model weights saved in {}".format(output_model_file))
508
+
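# Editor's note: an illustrative sketch, not part of the original file. `MyModel` stands for
# any concrete PreTrainedModel subclass defined elsewhere (e.g. the BertModel in modeling_bert.py).
#
#     model.save_pretrained("./my_model_directory")   # writes config.json and the weights file
#     reloaded = MyModel.from_pretrained("./my_model_directory")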
509
+ @classmethod
510
+ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
511
+ r"""Instantiate a pretrained pytorch model from a pre-trained model configuration.
512
+
513
+ The model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated)
514
+ To train the model, you should first set it back in training mode with ``model.train()``
515
+
516
+ The warning ``Weights from XXX not initialized from pretrained model`` means that the weights of XXX do not come pre-trained with the rest of the model.
517
+ It is up to you to train those weights with a downstream fine-tuning task.
518
+
519
+ The warning ``Weights from XXX not used in YYY`` means that the layer XXX is not used by YYY, therefore those weights are discarded.
520
+
521
+ Parameters:
522
+ pretrained_model_name_or_path: either:
523
+ - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
524
+ - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
525
+ - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
526
+ - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
527
+ - None if you are both providing the configuration and state dictionary (resp. with keyword arguments ``config`` and ``state_dict``)
528
+
529
+ model_args: (`optional`) Sequence of positional arguments:
530
+ All remaining positional arguments will be passed to the underlying model's ``__init__`` method
531
+
532
+ config: (`optional`) one of:
533
+ - an instance of a class derived from :class:`~transformers.PretrainedConfig`, or
534
+ - a string valid as input to :func:`~transformers.PretrainedConfig.from_pretrained()`
535
+
536
+ Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
537
+ - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
538
+ - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
539
+ - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
540
+
541
+ state_dict: (`optional`) dict:
542
+ an optional state dictionary for the model to use instead of a state dictionary loaded from the saved weights file.
543
+ This option can be used if you want to create a model from a pretrained configuration but load your own weights.
544
+ In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
545
+
546
+ cache_dir: (`optional`) string:
547
+ Path to a directory in which a downloaded pre-trained model
548
+ configuration should be cached if the standard cache should not be used.
549
+
550
+ force_download: (`optional`) boolean, default False:
551
+ Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
552
+
553
+ resume_download: (`optional`) boolean, default False:
554
+ Do not delete an incompletely received file. Attempt to resume the download if such a file exists.
555
+
556
+ proxies: (`optional`) dict, default None:
557
+ A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
558
+ The proxies are used on each request.
559
+
560
+ output_loading_info: (`optional`) boolean:
561
+ Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.
562
+
563
+ kwargs: (`optional`) Remaining dictionary of keyword arguments:
564
+ Can be used to update the configuration object (after it has been loaded) and to initialize the model (e.g. ``output_attention=True``). Behaves differently depending on whether a ``config`` is provided or automatically loaded:
565
+
566
+ - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
567
+ - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
568
+
569
+ Examples::
570
+
571
+ # For example purposes. Not runnable.
572
+ model = BertModel.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache.
573
+ model = BertModel.from_pretrained('./test/saved_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')`
574
+ model = BertModel.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading
575
+ assert model.config.output_attention == True
576
+ # Loading from a TF checkpoint file instead of a PyTorch model (slower)
577
+ config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json')
578
+ model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)
579
+
580
+ """
581
+ config = kwargs.pop("config", None)
582
+ state_dict = kwargs.pop("state_dict", None)
583
+ cache_dir = kwargs.pop("cache_dir", None)
584
+ from_tf = kwargs.pop("from_tf", False)
585
+ force_download = kwargs.pop("force_download", False)
586
+ resume_download = kwargs.pop("resume_download", False)
587
+ proxies = kwargs.pop("proxies", None)
588
+ output_loading_info = kwargs.pop("output_loading_info", False)
589
+ local_files_only = kwargs.pop("local_files_only", False)
590
+ use_cdn = kwargs.pop("use_cdn", True)
591
+
592
+ # Load config if we don't provide a configuration
593
+ if not isinstance(config, PretrainedConfig):
594
+ config_path = config if config is not None else pretrained_model_name_or_path
595
+ config, model_kwargs = cls.config_class.from_pretrained(
596
+ config_path,
597
+ *model_args,
598
+ cache_dir=cache_dir,
599
+ return_unused_kwargs=True,
600
+ force_download=force_download,
601
+ resume_download=resume_download,
602
+ proxies=proxies,
603
+ local_files_only=local_files_only,
604
+ **kwargs,
605
+ )
606
+ else:
607
+ model_kwargs = kwargs
608
+
609
+ # Load model
610
+ if pretrained_model_name_or_path is not None:
611
+ if os.path.isdir(pretrained_model_name_or_path):
612
+ if from_tf and os.path.isfile(os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index")):
613
+ # Load from a TF 1.0 checkpoint
614
+ archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index")
615
+ elif from_tf and os.path.isfile(os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)):
616
+ # Load from a TF 2.0 checkpoint
617
+ archive_file = os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)
618
+ elif os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)):
619
+ # Load from a PyTorch checkpoint
620
+ archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
621
+ else:
622
+ raise EnvironmentError(
623
+ "Error no file named {} found in directory {} or `from_tf` set to False".format(
624
+ [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME + ".index"],
625
+ pretrained_model_name_or_path,
626
+ )
627
+ )
628
+ elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
629
+ archive_file = pretrained_model_name_or_path
630
+ elif os.path.isfile(pretrained_model_name_or_path + ".index"):
631
+ assert (
632
+ from_tf
633
+ ), "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format(
634
+ pretrained_model_name_or_path + ".index"
635
+ )
636
+ archive_file = pretrained_model_name_or_path + ".index"
637
+ else:
638
+ archive_file = hf_bucket_url(
639
+ pretrained_model_name_or_path,
640
+ filename=(TF2_WEIGHTS_NAME if from_tf else WEIGHTS_NAME),
641
+ use_cdn=use_cdn,
642
+ )
643
+
644
+ try:
645
+ # Load from URL or cache if already cached
646
+ resolved_archive_file = cached_path(
647
+ archive_file,
648
+ cache_dir=cache_dir,
649
+ force_download=force_download,
650
+ proxies=proxies,
651
+ resume_download=resume_download,
652
+ local_files_only=local_files_only,
653
+ )
654
+ if resolved_archive_file is None:
655
+ raise EnvironmentError
656
+ except EnvironmentError:
657
+ msg = (
658
+ f"Can't load weights for '{pretrained_model_name_or_path}'. Make sure that:\n\n"
659
+ f"- '{pretrained_model_name_or_path}' is a correct model identifier listed on 'https://huggingface.co/models'\n\n"
660
+ f"- or '{pretrained_model_name_or_path}' is the correct path to a directory containing a file named one of {WEIGHTS_NAME}, {TF2_WEIGHTS_NAME}, {TF_WEIGHTS_NAME}.\n\n"
661
+ )
662
+ raise EnvironmentError(msg)
663
+
664
+ if resolved_archive_file == archive_file:
665
+ logger.info("loading weights file {}".format(archive_file))
666
+ else:
667
+ logger.info("loading weights file {} from cache at {}".format(archive_file, resolved_archive_file))
668
+ else:
669
+ resolved_archive_file = None
670
+
671
+ # Instantiate model.
672
+ model = cls(config, *model_args, **model_kwargs)
673
+
674
+ if state_dict is None and not from_tf:
675
+ try:
676
+ state_dict = torch.load(resolved_archive_file, map_location="cpu")
677
+ except Exception:
678
+ raise OSError(
679
+ "Unable to load weights from pytorch checkpoint file. "
680
+ "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True. "
681
+ )
682
+
683
+ missing_keys = []
684
+ unexpected_keys = []
685
+ error_msgs = []
686
+
687
+ if from_tf:
688
+ if resolved_archive_file.endswith(".index"):
689
+ # Load from a TensorFlow 1.X checkpoint - provided by original authors
690
+ model = cls.load_tf_weights(model, config, resolved_archive_file[:-6]) # Remove the '.index'
691
+ else:
692
+ # Load from our TensorFlow 2.0 checkpoints
693
+ try:
694
+ from transformers import load_tf2_checkpoint_in_pytorch_model
695
+
696
+ model = load_tf2_checkpoint_in_pytorch_model(model, resolved_archive_file, allow_missing_keys=True)
697
+ except ImportError:
698
+ logger.error(
699
+ "Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see "
700
+ "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions."
701
+ )
702
+ raise
703
+ else:
704
+ # Convert old format to new format if needed from a PyTorch state_dict
705
+ old_keys = []
706
+ new_keys = []
707
+ for key in state_dict.keys():
708
+ new_key = None
709
+ if "gamma" in key:
710
+ new_key = key.replace("gamma", "weight")
711
+ if "beta" in key:
712
+ new_key = key.replace("beta", "bias")
713
+ if new_key:
714
+ old_keys.append(key)
715
+ new_keys.append(new_key)
716
+ for old_key, new_key in zip(old_keys, new_keys):
717
+ state_dict[new_key] = state_dict.pop(old_key)
718
+
719
+ # copy state_dict so _load_from_state_dict can modify it
720
+ metadata = getattr(state_dict, "_metadata", None)
721
+ state_dict = state_dict.copy()
722
+ if metadata is not None:
723
+ state_dict._metadata = metadata
724
+
725
+ ##############################################################################################
726
+ # Print out state_dict's contents: keys
727
+ '''
728
+ for key, _ in state_dict.items():
729
+ print(key)
730
+ '''
731
+ ##############################################################################################
732
+
733
+
734
+ # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants
735
+ # so we need to apply the function recursively.
736
+ def load(module: nn.Module, prefix=""):
737
+ local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
738
+ module._load_from_state_dict(
739
+ state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs,
740
+ )
741
+ for name, child in module._modules.items():
742
+ if child is not None:
743
+ load(child, prefix + name + ".")
744
+
745
+ # Make sure we are able to load base models as well as derived models (with heads)
746
+ start_prefix = ""
747
+ model_to_load = model
748
+ has_prefix_module = any(s.startswith(cls.base_model_prefix) for s in state_dict.keys())
749
+ if not hasattr(model, cls.base_model_prefix) and has_prefix_module:
750
+ start_prefix = cls.base_model_prefix + "."
751
+ if hasattr(model, cls.base_model_prefix) and not has_prefix_module:
752
+ model_to_load = getattr(model, cls.base_model_prefix)
753
+
754
+ load(model_to_load, prefix=start_prefix)
755
+
756
+ if model.__class__.__name__ != model_to_load.__class__.__name__:
757
+ base_model_state_dict = model_to_load.state_dict().keys()
758
+ head_model_state_dict_without_base_prefix = [
759
+ key.split(cls.base_model_prefix + ".")[-1] for key in model.state_dict().keys()
760
+ ]
761
+
762
+ missing_keys.extend(head_model_state_dict_without_base_prefix - base_model_state_dict)
763
+
764
+ if len(unexpected_keys) > 0:
765
+ logger.warning(
766
+ f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when "
767
+ f"initializing {model.__class__.__name__}: {unexpected_keys}\n"
768
+ f"- This IS expected if you are initializing {model.__class__.__name__} from the checkpoint of a model trained on another task "
769
+ f"or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).\n"
770
+ f"- This IS NOT expected if you are initializing {model.__class__.__name__} from the checkpoint of a model that you expect "
771
+ f"to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model)."
772
+ )
773
+ else:
774
+ logger.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n")
775
+ if len(missing_keys) > 0:
776
+ logger.warning(
777
+ f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at {pretrained_model_name_or_path} "
778
+ f"and are newly initialized: {missing_keys}\n"
779
+ f"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference."
780
+ )
781
+ else:
782
+ logger.info(
783
+ f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at {pretrained_model_name_or_path}.\n"
784
+ f"If your task is similar to the task the model of the checkpoint was trained on, "
785
+ f"you can already use {model.__class__.__name__} for predictions without further training."
786
+ )
787
+ if len(error_msgs) > 0:
788
+ raise RuntimeError(
789
+ "Error(s) in loading state_dict for {}:\n\t{}".format(
790
+ model.__class__.__name__, "\n\t".join(error_msgs)
791
+ )
792
+ )
793
+ model.tie_weights() # make sure token embedding weights are still tied if needed
794
+
795
+ # Set model in evaluation mode to deactivate DropOut modules by default
796
+ model.eval()
797
+
798
+ if output_loading_info:
799
+ loading_info = {
800
+ "missing_keys": missing_keys,
801
+ "unexpected_keys": unexpected_keys,
802
+ "error_msgs": error_msgs,
803
+ }
804
+ return model, loading_info
805
+
806
+ if hasattr(config, "xla_device") and config.xla_device:
807
+ import torch_xla.core.xla_model as xm
808
+
809
+ model = xm.send_cpu_data_to_device(model, xm.xla_device())
810
+ model.to(xm.xla_device())
811
+
812
+ return model
813
+
814
+
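# Editor's note: an illustrative sketch, not part of the original file. `MyModel` stands for
# any concrete PreTrainedModel subclass defined elsewhere.
#
#     model, info = MyModel.from_pretrained("./my_model_directory", output_loading_info=True)
#     # info["missing_keys"]    -> parameters that had to be newly initialized
#     # info["unexpected_keys"] -> checkpoint entries that were not used
#     # The returned model is already in eval() mode; call model.train() before fine-tuning.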
815
+ class Conv1D(nn.Module):
816
+ def __init__(self, nf, nx):
817
+ """ Conv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2)
818
+ Basically works like a Linear layer but the weights are transposed
819
+ """
820
+ super().__init__()
821
+ self.nf = nf
822
+ w = torch.empty(nx, nf)
823
+ nn.init.normal_(w, std=0.02)
824
+ self.weight = nn.Parameter(w)
825
+ self.bias = nn.Parameter(torch.zeros(nf))
826
+
827
+ def forward(self, x):
828
+ size_out = x.size()[:-1] + (self.nf,)
829
+ x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight)
830
+ x = x.view(*size_out)
831
+ return x
832
+
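# Editor's note: an illustrative sketch, not part of the original file.
#
#     conv = Conv1D(nf=3072, nx=768)     # behaves like nn.Linear(768, 3072) with a transposed weight
#     y = conv(torch.randn(2, 10, 768))  # y.shape == (2, 10, 3072)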
833
+
834
+ class PoolerStartLogits(nn.Module):
835
+ """ Compute SQuAD start_logits from sequence hidden states. """
836
+
837
+ def __init__(self, config):
838
+ super().__init__()
839
+ self.dense = nn.Linear(config.hidden_size, 1)
840
+
841
+ def forward(self, hidden_states, p_mask=None):
842
+ """ Args:
843
+ **p_mask**: (`optional`) ``torch.FloatTensor`` of shape `(batch_size, seq_len)`
844
+ invalid position mask such as query and special symbols (PAD, SEP, CLS)
845
+ 1.0 means token should be masked.
846
+ """
847
+ x = self.dense(hidden_states).squeeze(-1)
848
+
849
+ if p_mask is not None:
850
+ if next(self.parameters()).dtype == torch.float16:
851
+ x = x * (1 - p_mask) - 65500 * p_mask
852
+ else:
853
+ x = x * (1 - p_mask) - 1e30 * p_mask
854
+
855
+ return x
856
+
857
+
858
+ class PoolerEndLogits(nn.Module):
859
+ """ Compute SQuAD end_logits from sequence hidden states and start token hidden state.
860
+ """
861
+
862
+ def __init__(self, config):
863
+ super().__init__()
864
+ self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size)
865
+ self.activation = nn.Tanh()
866
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
867
+ self.dense_1 = nn.Linear(config.hidden_size, 1)
868
+
869
+ def forward(self, hidden_states, start_states=None, start_positions=None, p_mask=None):
870
+ """ Args:
871
+ One of ``start_states``, ``start_positions`` should be not None.
872
+ If both are set, ``start_positions`` overrides ``start_states``.
873
+
874
+ **start_states**: ``torch.LongTensor`` of shape identical to hidden_states
875
+ hidden states of the first tokens for the labeled span.
876
+ **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)``
877
+ position of the first token for the labeled span:
878
+ **p_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, seq_len)``
879
+ Mask of invalid position such as query and special symbols (PAD, SEP, CLS)
880
+ 1.0 means token should be masked.
881
+ """
882
+ assert (
883
+ start_states is not None or start_positions is not None
884
+ ), "One of start_states, start_positions should be not None"
885
+ if start_positions is not None:
886
+ slen, hsz = hidden_states.shape[-2:]
887
+ start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz)
888
+ start_states = hidden_states.gather(-2, start_positions) # shape (bsz, 1, hsz)
889
+ start_states = start_states.expand(-1, slen, -1) # shape (bsz, slen, hsz)
890
+
891
+ x = self.dense_0(torch.cat([hidden_states, start_states], dim=-1))
892
+ x = self.activation(x)
893
+ x = self.LayerNorm(x)
894
+ x = self.dense_1(x).squeeze(-1)
895
+
896
+ if p_mask is not None:
897
+ if next(self.parameters()).dtype == torch.float16:
898
+ x = x * (1 - p_mask) - 65500 * p_mask
899
+ else:
900
+ x = x * (1 - p_mask) - 1e30 * p_mask
901
+
902
+ return x
903
+
904
+
905
+ class PoolerAnswerClass(nn.Module):
906
+ """ Compute SQuAD 2.0 answer class from classification and start tokens hidden states. """
907
+
908
+ def __init__(self, config):
909
+ super().__init__()
910
+ self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size)
911
+ self.activation = nn.Tanh()
912
+ self.dense_1 = nn.Linear(config.hidden_size, 1, bias=False)
913
+
914
+ def forward(self, hidden_states, start_states=None, start_positions=None, cls_index=None):
915
+ """
916
+ Args:
917
+ One of ``start_states``, ``start_positions`` should be not None.
918
+ If both are set, ``start_positions`` overrides ``start_states``.
919
+
920
+ **start_states**: ``torch.LongTensor`` of shape identical to ``hidden_states``.
921
+ hidden states of the first tokens for the labeled span.
922
+ **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)``
923
+ position of the first token for the labeled span.
924
+ **cls_index**: torch.LongTensor of shape ``(batch_size,)``
925
+ position of the CLS token. If None, take the last token.
926
+
927
+ note(Original repo):
928
+ no dependency on end_feature so that we can obtain one single `cls_logits`
929
+ for each sample
930
+ """
931
+ hsz = hidden_states.shape[-1]
932
+ assert (
933
+ start_states is not None or start_positions is not None
934
+ ), "One of start_states, start_positions should be not None"
935
+ if start_positions is not None:
936
+ start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz)
937
+ start_states = hidden_states.gather(-2, start_positions).squeeze(-2) # shape (bsz, hsz)
938
+
939
+ if cls_index is not None:
940
+ cls_index = cls_index[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz)
941
+ cls_token_state = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, hsz)
942
+ else:
943
+ cls_token_state = hidden_states[:, -1, :] # shape (bsz, hsz)
944
+
945
+ x = self.dense_0(torch.cat([start_states, cls_token_state], dim=-1))
946
+ x = self.activation(x)
947
+ x = self.dense_1(x).squeeze(-1)
948
+
949
+ return x
950
+
951
+
952
+ class SQuADHead(nn.Module):
953
+ r""" A SQuAD head inspired by XLNet.
954
+
955
+ Parameters:
956
+ config (:class:`~transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
957
+
958
+ Inputs:
959
+ **hidden_states**: ``torch.FloatTensor`` of shape ``(batch_size, seq_len, hidden_size)``
960
+ hidden states of sequence tokens
961
+ **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)``
962
+ position of the first token for the labeled span.
963
+ **end_positions**: ``torch.LongTensor`` of shape ``(batch_size,)``
964
+ position of the last token for the labeled span.
965
+ **cls_index**: torch.LongTensor of shape ``(batch_size,)``
966
+ position of the CLS token. If None, take the last token.
967
+ **is_impossible**: ``torch.LongTensor`` of shape ``(batch_size,)``
968
+ Whether the question has a possible answer in the paragraph or not.
969
+ **p_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, seq_len)``
970
+ Mask of invalid position such as query and special symbols (PAD, SEP, CLS)
971
+ 1.0 means token should be masked.
972
+
973
+ Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
974
+ **loss**: (`optional`, returned if both ``start_positions`` and ``end_positions`` are provided) ``torch.FloatTensor`` of shape ``(1,)``:
975
+ Classification loss as the sum of start token, end token (and is_impossible if provided) classification losses.
976
+ **start_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
977
+ ``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)``
978
+ Log probabilities for the top config.start_n_top start token possibilities (beam-search).
979
+ **start_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
980
+ ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)``
981
+ Indices for the top config.start_n_top start token possibilities (beam-search).
982
+ **end_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
983
+ ``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``
984
+ Log probabilities for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
985
+ **end_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
986
+ ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``
987
+ Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
988
+ **cls_logits**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
989
+ ``torch.FloatTensor`` of shape ``(batch_size,)``
990
+ Log probabilities for the ``is_impossible`` label of the answers.
991
+ """
992
+
993
+ def __init__(self, config):
994
+ super().__init__()
995
+ self.start_n_top = config.start_n_top
996
+ self.end_n_top = config.end_n_top
997
+
998
+ self.start_logits = PoolerStartLogits(config)
999
+ self.end_logits = PoolerEndLogits(config)
1000
+ self.answer_class = PoolerAnswerClass(config)
1001
+
1002
+ def forward(
1003
+ self, hidden_states, start_positions=None, end_positions=None, cls_index=None, is_impossible=None, p_mask=None,
1004
+ ):
1005
+ outputs = ()
1006
+
1007
+ start_logits = self.start_logits(hidden_states, p_mask=p_mask)
1008
+
1009
+ if start_positions is not None and end_positions is not None:
1010
+ # If we are on multi-GPU, let's remove the dimension added by batch splitting
1011
+ for x in (start_positions, end_positions, cls_index, is_impossible):
1012
+ if x is not None and x.dim() > 1:
1013
+ x.squeeze_(-1)
1014
+
1015
+ # during training, compute the end logits based on the ground truth of the start position
1016
+ end_logits = self.end_logits(hidden_states, start_positions=start_positions, p_mask=p_mask)
1017
+
1018
+ loss_fct = CrossEntropyLoss()
1019
+ start_loss = loss_fct(start_logits, start_positions)
1020
+ end_loss = loss_fct(end_logits, end_positions)
1021
+ total_loss = (start_loss + end_loss) / 2
1022
+
1023
+ if cls_index is not None and is_impossible is not None:
1024
+ # Predict answerability from the representation of CLS and START
1025
+ cls_logits = self.answer_class(hidden_states, start_positions=start_positions, cls_index=cls_index)
1026
+ loss_fct_cls = nn.BCEWithLogitsLoss()
1027
+ cls_loss = loss_fct_cls(cls_logits, is_impossible)
1028
+
1029
+ # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss
1030
+ total_loss += cls_loss * 0.5
1031
+
1032
+ outputs = (total_loss,) + outputs
1033
+
1034
+ else:
1035
+ # during inference, compute the end logits based on beam search
1036
+ bsz, slen, hsz = hidden_states.size()
1037
+ start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen)
1038
+
1039
+ start_top_log_probs, start_top_index = torch.topk(
1040
+ start_log_probs, self.start_n_top, dim=-1
1041
+ ) # shape (bsz, start_n_top)
1042
+ start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz)
1043
+ start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz)
1044
+ start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz)
1045
+
1046
+ hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(
1047
+ start_states
1048
+ ) # shape (bsz, slen, start_n_top, hsz)
1049
+ p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None
1050
+ end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask)
1051
+ end_log_probs = F.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top)
1052
+
1053
+ end_top_log_probs, end_top_index = torch.topk(
1054
+ end_log_probs, self.end_n_top, dim=1
1055
+ ) # shape (bsz, end_n_top, start_n_top)
1056
+ end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top)
1057
+ end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top)
1058
+
1059
+ start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs)
1060
+ cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index)
1061
+
1062
+ outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits,) + outputs
1063
+
1064
+ # return start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits
1065
+ # or (if labels are provided) (total_loss,)
1066
+ return outputs
1067
+
1068
+
1069
+ class SequenceSummary(nn.Module):
1070
+ r""" Compute a single vector summary of a sequence hidden states according to various possibilities:
1071
+ Args of the config class:
1072
+ summary_type:
1073
+ - 'last' => [default] take the last token hidden state (like XLNet)
1074
+ - 'first' => take the first token hidden state (like Bert)
1075
+ - 'mean' => take the mean of all tokens hidden states
1076
+ - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
1077
+ - 'attn' => Not implemented now, use multi-head attention
1078
+ summary_use_proj: Add a projection after the vector extraction
1079
+ summary_proj_to_labels: If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
1080
+ summary_activation: 'tanh' or another supported activation string => add that activation to the output; None => no activation (default)
1081
+ summary_first_dropout: Add a dropout before the projection and activation
1082
+ summary_last_dropout: Add a dropout after the projection and activation
1083
+ """
1084
+
1085
+ def __init__(self, config: PretrainedConfig):
1086
+ super().__init__()
1087
+
1088
+ self.summary_type = getattr(config, "summary_type", "last")
1089
+ if self.summary_type == "attn":
1090
+ # We should use a standard multi-head attention module with absolute positional embedding for that.
1091
+ # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
1092
+ # We can probably just use the multi-head attention module of PyTorch >=1.1.0
1093
+ raise NotImplementedError
1094
+
1095
+ self.summary = Identity()
1096
+ if hasattr(config, "summary_use_proj") and config.summary_use_proj:
1097
+ if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0:
1098
+ num_classes = config.num_labels
1099
+ else:
1100
+ num_classes = config.hidden_size
1101
+ self.summary = nn.Linear(config.hidden_size, num_classes)
1102
+
1103
+ activation_string = getattr(config, "summary_activation", None)
1104
+ self.activation: Callable = (get_activation(activation_string) if activation_string else Identity())
1105
+
1106
+ self.first_dropout = Identity()
1107
+ if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0:
1108
+ self.first_dropout = nn.Dropout(config.summary_first_dropout)
1109
+
1110
+ self.last_dropout = Identity()
1111
+ if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0:
1112
+ self.last_dropout = nn.Dropout(config.summary_last_dropout)
1113
+
1114
+ def forward(self, hidden_states, cls_index=None):
1115
+ """ hidden_states: float Tensor in shape [bsz, ..., seq_len, hidden_size], the hidden-states of the last layer.
1116
+ cls_index: [optional] position of the classification token if summary_type == 'cls_index',
1117
+ shape (bsz,) or more generally (bsz, ...) where ... are optional leading dimensions of hidden_states.
1118
+ if summary_type == 'cls_index' and cls_index is None:
1119
+ we take the last token of the sequence as classification token
1120
+ """
1121
+ if self.summary_type == "last":
1122
+ output = hidden_states[:, -1]
1123
+ elif self.summary_type == "first":
1124
+ output = hidden_states[:, 0]
1125
+ elif self.summary_type == "mean":
1126
+ output = hidden_states.mean(dim=1)
1127
+ elif self.summary_type == "cls_index":
1128
+ if cls_index is None:
1129
+ cls_index = torch.full_like(hidden_states[..., :1, :], hidden_states.shape[-2] - 1, dtype=torch.long,)
1130
+ else:
1131
+ cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)
1132
+ cls_index = cls_index.expand((-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),))
1133
+ # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states
1134
+ output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size)
1135
+ elif self.summary_type == "attn":
1136
+ raise NotImplementedError
1137
+
1138
+ output = self.first_dropout(output)
1139
+ output = self.summary(output)
1140
+ output = self.activation(output)
1141
+ output = self.last_dropout(output)
1142
+
1143
+ return output
1144
+
1145
+
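# Editor's note: an illustrative sketch, not part of the original file, assuming a config with
# summary_type == "first" and summary_use_proj == True.
#
#     summary = SequenceSummary(config)
#     pooled = summary(torch.randn(4, 16, config.hidden_size))
#     # pooled holds the projected hidden state of the first token of each sequence, of shape
#     # (4, hidden_size) or (4, num_labels) depending on summary_proj_to_labels.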
1146
+ def prune_linear_layer(layer, index, dim=0):
1147
+ """ Prune a linear layer (a model parameter) to keep only entries in index.
1148
+ Return the pruned layer as a new layer with requires_grad=True.
1149
+ Used to remove heads.
1150
+ """
1151
+ index = index.to(layer.weight.device)
1152
+ W = layer.weight.index_select(dim, index).clone().detach()
1153
+ if layer.bias is not None:
1154
+ if dim == 1:
1155
+ b = layer.bias.clone().detach()
1156
+ else:
1157
+ b = layer.bias[index].clone().detach()
1158
+ new_size = list(layer.weight.size())
1159
+ new_size[dim] = len(index)
1160
+ new_layer = nn.Linear(new_size[1], new_size[0], bias=layer.bias is not None).to(layer.weight.device)
1161
+ new_layer.weight.requires_grad = False
1162
+ new_layer.weight.copy_(W.contiguous())
1163
+ new_layer.weight.requires_grad = True
1164
+ if layer.bias is not None:
1165
+ new_layer.bias.requires_grad = False
1166
+ new_layer.bias.copy_(b.contiguous())
1167
+ new_layer.bias.requires_grad = True
1168
+ return new_layer
1169
+
1170
+
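# Editor's note: an illustrative sketch, not part of the original file.
#
#     layer = nn.Linear(768, 768)
#     keep = torch.arange(512)                         # indices of output units to keep
#     pruned = prune_linear_layer(layer, keep, dim=0)  # -> nn.Linear(768, 512)
#     # The pruned weights are detached copies with requires_grad=True, ready for further training.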
1171
+ def prune_conv1d_layer(layer, index, dim=1):
1172
+ """ Prune a Conv1D layer (a model parameter) to keep only entries in index.
1173
+ A Conv1D works like a Linear layer (as used in e.g. BERT) but the weights are transposed.
1174
+ Return the pruned layer as a new layer with requires_grad=True.
1175
+ Used to remove heads.
1176
+ """
1177
+ index = index.to(layer.weight.device)
1178
+ W = layer.weight.index_select(dim, index).clone().detach()
1179
+ if dim == 0:
1180
+ b = layer.bias.clone().detach()
1181
+ else:
1182
+ b = layer.bias[index].clone().detach()
1183
+ new_size = list(layer.weight.size())
1184
+ new_size[dim] = len(index)
1185
+ new_layer = Conv1D(new_size[1], new_size[0]).to(layer.weight.device)
1186
+ new_layer.weight.requires_grad = False
1187
+ new_layer.weight.copy_(W.contiguous())
1188
+ new_layer.weight.requires_grad = True
1189
+ new_layer.bias.requires_grad = False
1190
+ new_layer.bias.copy_(b.contiguous())
1191
+ new_layer.bias.requires_grad = True
1192
+ return new_layer
1193
+
1194
+
1195
+ def prune_layer(layer, index, dim=None):
1196
+ """ Prune a Conv1D or nn.Linear layer (a model parameter) to keep only entries in index.
1197
+ Return the pruned layer as a new layer with requires_grad=True.
1198
+ Used to remove heads.
1199
+ """
1200
+ if isinstance(layer, nn.Linear):
1201
+ return prune_linear_layer(layer, index, dim=0 if dim is None else dim)
1202
+ elif isinstance(layer, Conv1D):
1203
+ return prune_conv1d_layer(layer, index, dim=1 if dim is None else dim)
1204
+ else:
1205
+ raise ValueError("Can't prune layer of class {}".format(layer.__class__))
1206
+
1207
+
1208
+ def apply_chunking_to_forward(
1209
+ chunk_size: int, chunk_dim: int, forward_fn: Callable[..., torch.Tensor], *input_tensors
1210
+ ) -> torch.Tensor:
1211
+ """
1212
+ This function chunks the `input_tensors` into smaller input tensor parts of size `chunk_size` over the dimension `chunk_dim`.
1213
+ It then applies a layer `forward_fn` to each chunk independently to save memory.
1214
+ If the `forward_fn` is independent across the `chunk_dim` this function will yield the
1215
+ same result as not applying it.
1216
+
1217
+ Args:
1218
+ chunk_size: int - the chunk size of a chunked tensor. `num_chunks` = `len(input_tensors[0]) / chunk_size`
1219
+ chunk_dim: int - the dimension over which the input_tensors should be chunked
1220
+ forward_fn: fn - the forward fn of the model
1221
+ input_tensors: tuple(torch.Tensor) - the input tensors of `forward_fn` which are chunked
1222
+ Returns:
1223
+ a Tensor with the same shape the forward_fn would have given if applied
1224
+
1225
+
1226
+ Examples::
1227
+
1228
+ # rename the usual forward() fn to forward_chunk()
1229
+ def forward_chunk(self, hidden_states):
1230
+ hidden_states = self.decoder(hidden_states)
1231
+ return hidden_states
1232
+
1233
+ # implement a chunked forward function
1234
+ def forward(self, hidden_states):
1235
+ return apply_chunking_to_forward(self.chunk_size_lm_head, self.seq_len_dim, self.forward_chunk, hidden_states)
1236
+ """
1237
+
1238
+ assert len(input_tensors) > 0, "{} has to be a tuple/list of tensors".format(input_tensors)
1239
+ tensor_shape = input_tensors[0].shape
1240
+ assert all(
1241
+ input_tensor.shape == tensor_shape for input_tensor in input_tensors
1242
+ ), "All input tensors have to be of the same shape"
1243
+
1244
+ # inspect.signature has existed since Python 3.5 and is a pure-Python helper -> no problem with backward compatibility
1245
+ num_args_in_forward_chunk_fn = len(inspect.signature(forward_fn).parameters)
1246
+ assert num_args_in_forward_chunk_fn == len(
1247
+ input_tensors
1248
+ ), "forward_chunk_fn expects {} arguments, but only {} input tensors are given".format(
1249
+ num_args_in_forward_chunk_fn, len(input_tensors)
1250
+ )
1251
+
1252
+ if chunk_size > 0:
1253
+ assert (
1254
+ input_tensors[0].shape[chunk_dim] % chunk_size == 0
1255
+ ), "The dimension to be chunked {} has to be a multiple of the chunk size {}".format(
1256
+ input_tensors[0].shape[chunk_dim], chunk_size
1257
+ )
1258
+
1259
+ num_chunks = input_tensors[0].shape[chunk_dim] // chunk_size
1260
+
1261
+ # chunk input tensor into tuples
1262
+ input_tensors_chunks = tuple(input_tensor.chunk(num_chunks, dim=chunk_dim) for input_tensor in input_tensors)
1263
+ # apply forward fn to every tuple
1264
+ output_chunks = tuple(forward_fn(*input_tensors_chunk) for input_tensors_chunk in zip(*input_tensors_chunks))
1265
+ # concatenate output at same dimension
1266
+ return torch.cat(output_chunks, dim=chunk_dim)
1267
+
1268
+ return forward_fn(*input_tensors)
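# Editor's note: an illustrative sketch, not part of the original file. Any position-wise
# function gives the same result chunked or not; only peak memory changes.
#
#     ff = lambda t: t * 2.0                        # stand-in for a per-position feed-forward
#     x = torch.randn(2, 8, 16)
#     out = apply_chunking_to_forward(4, 1, ff, x)  # 8 / 4 = 2 chunks along dim 1
#     assert torch.allclose(out, ff(x))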
CGFormer/bert/tokenization_bert.py ADDED
@@ -0,0 +1,545 @@
1
+ # coding=utf-8
2
+ # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """Tokenization classes."""
16
+
17
+
18
+ import collections
19
+ import logging
20
+ import os
21
+ import unicodedata
22
+ from typing import List, Optional
23
+
24
+ from .tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
25
+
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+ VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
30
+
31
+ PRETRAINED_VOCAB_FILES_MAP = {
32
+ "vocab_file": {
33
+ "bert-base-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt",
34
+ "bert-large-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-vocab.txt",
35
+ "bert-base-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt",
36
+ "bert-large-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-vocab.txt",
37
+ "bert-base-multilingual-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-uncased-vocab.txt",
38
+ "bert-base-multilingual-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-multilingual-cased-vocab.txt",
39
+ "bert-base-chinese": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt",
40
+ "bert-base-german-cased": "https://int-deepset-models-bert.s3.eu-central-1.amazonaws.com/pytorch/bert-base-german-cased-vocab.txt",
41
+ "bert-large-uncased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-vocab.txt",
42
+ "bert-large-cased-whole-word-masking": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-vocab.txt",
43
+ "bert-large-uncased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-vocab.txt",
44
+ "bert-large-cased-whole-word-masking-finetuned-squad": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-vocab.txt",
45
+ "bert-base-cased-finetuned-mrpc": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt",
46
+ "bert-base-german-dbmdz-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-vocab.txt",
47
+ "bert-base-german-dbmdz-uncased": "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-vocab.txt",
48
+ "TurkuNLP/bert-base-finnish-cased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/vocab.txt",
49
+ "TurkuNLP/bert-base-finnish-uncased-v1": "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/vocab.txt",
50
+ "wietsedv/bert-base-dutch-cased": "https://s3.amazonaws.com/models.huggingface.co/bert/wietsedv/bert-base-dutch-cased/vocab.txt",
51
+ }
52
+ }
53
+
54
+ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
55
+ "bert-base-uncased": 512,
56
+ "bert-large-uncased": 512,
57
+ "bert-base-cased": 512,
58
+ "bert-large-cased": 512,
59
+ "bert-base-multilingual-uncased": 512,
60
+ "bert-base-multilingual-cased": 512,
61
+ "bert-base-chinese": 512,
62
+ "bert-base-german-cased": 512,
63
+ "bert-large-uncased-whole-word-masking": 512,
64
+ "bert-large-cased-whole-word-masking": 512,
65
+ "bert-large-uncased-whole-word-masking-finetuned-squad": 512,
66
+ "bert-large-cased-whole-word-masking-finetuned-squad": 512,
67
+ "bert-base-cased-finetuned-mrpc": 512,
68
+ "bert-base-german-dbmdz-cased": 512,
69
+ "bert-base-german-dbmdz-uncased": 512,
70
+ "TurkuNLP/bert-base-finnish-cased-v1": 512,
71
+ "TurkuNLP/bert-base-finnish-uncased-v1": 512,
72
+ "wietsedv/bert-base-dutch-cased": 512,
73
+ }
74
+
75
+ PRETRAINED_INIT_CONFIGURATION = {
76
+ "bert-base-uncased": {"do_lower_case": True},
77
+ "bert-large-uncased": {"do_lower_case": True},
78
+ "bert-base-cased": {"do_lower_case": False},
79
+ "bert-large-cased": {"do_lower_case": False},
80
+ "bert-base-multilingual-uncased": {"do_lower_case": True},
81
+ "bert-base-multilingual-cased": {"do_lower_case": False},
82
+ "bert-base-chinese": {"do_lower_case": False},
83
+ "bert-base-german-cased": {"do_lower_case": False},
84
+ "bert-large-uncased-whole-word-masking": {"do_lower_case": True},
85
+ "bert-large-cased-whole-word-masking": {"do_lower_case": False},
86
+ "bert-large-uncased-whole-word-masking-finetuned-squad": {"do_lower_case": True},
87
+ "bert-large-cased-whole-word-masking-finetuned-squad": {"do_lower_case": False},
88
+ "bert-base-cased-finetuned-mrpc": {"do_lower_case": False},
89
+ "bert-base-german-dbmdz-cased": {"do_lower_case": False},
90
+ "bert-base-german-dbmdz-uncased": {"do_lower_case": True},
91
+ "TurkuNLP/bert-base-finnish-cased-v1": {"do_lower_case": False},
92
+ "TurkuNLP/bert-base-finnish-uncased-v1": {"do_lower_case": True},
93
+ "wietsedv/bert-base-dutch-cased": {"do_lower_case": False},
94
+ }
95
+
96
+
97
+ def load_vocab(vocab_file):
98
+ """Loads a vocabulary file into a dictionary."""
99
+ vocab = collections.OrderedDict()
100
+ with open(vocab_file, "r", encoding="utf-8") as reader:
101
+ tokens = reader.readlines()
102
+ for index, token in enumerate(tokens):
103
+ token = token.rstrip("\n")
104
+ vocab[token] = index
105
+ return vocab
106
+
107
+
108
+ def whitespace_tokenize(text):
109
+ """Runs basic whitespace cleaning and splitting on a piece of text."""
110
+ text = text.strip()
111
+ if not text:
112
+ return []
113
+ tokens = text.split()
114
+ return tokens
115
+
116
+
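# Editor's note: an illustrative sketch, not part of the original file. "vocab.txt" is a
# hypothetical file containing the lines "[PAD]", "[UNK]" and "hello".
#
#     vocab = load_vocab("vocab.txt")          # OrderedDict([('[PAD]', 0), ('[UNK]', 1), ('hello', 2)])
#     whitespace_tokenize("  hello  world ")   # -> ['hello', 'world']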
117
+ class BertTokenizer(PreTrainedTokenizer):
118
+ r"""
119
+ Constructs a BERT tokenizer. Based on WordPiece.
120
+
121
+ This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
122
+ should refer to the superclass for more information regarding methods.
123
+
124
+ Args:
125
+ vocab_file (:obj:`string`):
126
+ File containing the vocabulary.
127
+ do_lower_case (:obj:`bool`, `optional`, defaults to :obj:`True`):
128
+ Whether to lowercase the input when tokenizing.
129
+ do_basic_tokenize (:obj:`bool`, `optional`, defaults to :obj:`True`):
130
+ Whether to do basic tokenization before WordPiece.
131
+ never_split (:obj:`Iterable`, `optional`, defaults to :obj:`None`):
132
+ Collection of tokens which will never be split during tokenization. Only has an effect when
133
+ :obj:`do_basic_tokenize=True`
134
+ unk_token (:obj:`string`, `optional`, defaults to "[UNK]"):
135
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
136
+ token instead.
137
+ sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
138
+ The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
139
+ for sequence classification or for a text and a question for question answering.
140
+ It is also used as the last token of a sequence built with special tokens.
141
+ pad_token (:obj:`string`, `optional`, defaults to "[PAD]"):
142
+ The token used for padding, for example when batching sequences of different lengths.
143
+ cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
144
+ The classifier token which is used when doing sequence classification (classification of the whole
145
+ sequence instead of per-token classification). It is the first token of the sequence when built with
146
+ special tokens.
147
+ mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
148
+ The token used for masking values. This is the token used when training this model with masked language
149
+ modeling. This is the token which the model will try to predict.
150
+ tokenize_chinese_chars (:obj:`bool`, `optional`, defaults to :obj:`True`):
151
+ Whether to tokenize Chinese characters.
152
+ This should likely be deactivated for Japanese:
153
+ see: https://github.com/huggingface/transformers/issues/328
154
+ """
155
+
156
+ vocab_files_names = VOCAB_FILES_NAMES
157
+ pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
158
+ pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
159
+ max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
160
+
161
+ def __init__(
162
+ self,
163
+ vocab_file,
164
+ do_lower_case=True,
165
+ do_basic_tokenize=True,
166
+ never_split=None,
167
+ unk_token="[UNK]",
168
+ sep_token="[SEP]",
169
+ pad_token="[PAD]",
170
+ cls_token="[CLS]",
171
+ mask_token="[MASK]",
172
+ tokenize_chinese_chars=True,
173
+ **kwargs
174
+ ):
175
+ super().__init__(
176
+ unk_token=unk_token,
177
+ sep_token=sep_token,
178
+ pad_token=pad_token,
179
+ cls_token=cls_token,
180
+ mask_token=mask_token,
181
+ **kwargs,
182
+ )
183
+
184
+ if not os.path.isfile(vocab_file):
185
+ raise ValueError(
186
+ "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
187
+ "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)
188
+ )
189
+ self.vocab = load_vocab(vocab_file)
190
+ self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
191
+ self.do_basic_tokenize = do_basic_tokenize
192
+ if do_basic_tokenize:
193
+ self.basic_tokenizer = BasicTokenizer(
194
+ do_lower_case=do_lower_case, never_split=never_split, tokenize_chinese_chars=tokenize_chinese_chars
195
+ )
196
+ self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=self.unk_token)
197
+
198
+ @property
199
+ def vocab_size(self):
200
+ return len(self.vocab)
201
+
202
+ def get_vocab(self):
203
+ return dict(self.vocab, **self.added_tokens_encoder)
204
+
205
+ def _tokenize(self, text):
206
+ split_tokens = []
207
+ if self.do_basic_tokenize:
208
+ for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
209
+
210
+ # If the token is part of the never_split set
211
+ if token in self.basic_tokenizer.never_split:
212
+ split_tokens.append(token)
213
+ else:
214
+ split_tokens += self.wordpiece_tokenizer.tokenize(token)
215
+ else:
216
+ split_tokens = self.wordpiece_tokenizer.tokenize(text)
217
+ return split_tokens
218
+
219
+ def _convert_token_to_id(self, token):
220
+ """ Converts a token (str) to an id using the vocab. """
221
+ return self.vocab.get(token, self.vocab.get(self.unk_token))
222
+
223
+ def _convert_id_to_token(self, index):
224
+ """Converts an index (integer) to a token (str) using the vocab."""
225
+ return self.ids_to_tokens.get(index, self.unk_token)
226
+
227
+ def convert_tokens_to_string(self, tokens):
228
+ """ Converts a sequence of tokens (string) into a single string. """
229
+ out_string = " ".join(tokens).replace(" ##", "").strip()
230
+ return out_string
231
+
232
+ def build_inputs_with_special_tokens(
233
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
234
+ ) -> List[int]:
235
+ """
236
+ Build model inputs from a sequence or a pair of sequence for sequence classification tasks
237
+ by concatenating and adding special tokens.
238
+ A BERT sequence has the following format:
239
+
240
+ - single sequence: ``[CLS] X [SEP]``
241
+ - pair of sequences: ``[CLS] A [SEP] B [SEP]``
242
+
243
+ Args:
244
+ token_ids_0 (:obj:`List[int]`):
245
+ List of IDs to which the special tokens will be added
246
+ token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
247
+ Optional second list of IDs for sequence pairs.
248
+
249
+ Returns:
250
+ :obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
251
+ """
252
+ if token_ids_1 is None:
253
+ return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
254
+ cls = [self.cls_token_id]
255
+ sep = [self.sep_token_id]
256
+ return cls + token_ids_0 + sep + token_ids_1 + sep
257
+
258
+ def get_special_tokens_mask(
259
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
260
+ ) -> List[int]:
261
+ """
262
+ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
263
+ special tokens using the tokenizer ``prepare_for_model`` method.
264
+
265
+ Args:
266
+ token_ids_0 (:obj:`List[int]`):
267
+ List of ids.
268
+ token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
269
+ Optional second list of IDs for sequence pairs.
270
+ already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
271
+ Set to True if the token list is already formatted with special tokens for the model
272
+
273
+ Returns:
274
+ :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
275
+ """
276
+
277
+ if already_has_special_tokens:
278
+ if token_ids_1 is not None:
279
+ raise ValueError(
280
+ "You should not supply a second sequence if the provided sequence of "
281
+ "ids is already formatted with special tokens for the model."
282
+ )
283
+ return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
284
+
285
+ if token_ids_1 is not None:
286
+ return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
287
+ return [1] + ([0] * len(token_ids_0)) + [1]
288
+
289
+ def create_token_type_ids_from_sequences(
290
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
291
+ ) -> List[int]:
292
+ """
293
+ Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
294
+ A BERT sequence pair mask has the following format:
295
+
296
+ ::
297
+
298
+ 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
299
+ | first sequence | second sequence |
300
+
301
+ if token_ids_1 is None, only returns the first portion of the mask (0's).
302
+
303
+ Args:
304
+ token_ids_0 (:obj:`List[int]`):
305
+ List of ids.
306
+ token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
307
+ Optional second list of IDs for sequence pairs.
308
+
309
+ Returns:
310
+ :obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
311
+ sequence(s).
312
+ """
313
+ sep = [self.sep_token_id]
314
+ cls = [self.cls_token_id]
315
+ if token_ids_1 is None:
316
+ return len(cls + token_ids_0 + sep) * [0]
317
+ return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
318
+
319
+ def save_vocabulary(self, vocab_path):
320
+ """
321
+ Save the tokenizer vocabulary (the WordPiece vocab file) to a directory or file.
322
+
323
+ Args:
324
+ vocab_path (:obj:`str`):
325
+ The directory in which to save the vocabulary.
326
+
327
+ Returns:
328
+ :obj:`Tuple(str)`: Paths to the files saved.
329
+ """
330
+ index = 0
331
+ if os.path.isdir(vocab_path):
332
+ vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"])
333
+ else:
334
+ vocab_file = vocab_path
335
+ with open(vocab_file, "w", encoding="utf-8") as writer:
336
+ for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
337
+ if index != token_index:
338
+ logger.warning(
339
+ "Saving vocabulary to {}: vocabulary indices are not consecutive."
340
+ " Please check that the vocabulary is not corrupted!".format(vocab_file)
341
+ )
342
+ index = token_index
343
+ writer.write(token + "\n")
344
+ index += 1
345
+ return (vocab_file,)
346
+
347
+
348
+ class BasicTokenizer(object):
349
+ """Runs basic tokenization (punctuation splitting, lower casing, etc.)."""
350
+
351
+ def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True):
352
+ """ Constructs a BasicTokenizer.
353
+
354
+ Args:
355
+ **do_lower_case**: Whether to lower case the input.
356
+ **never_split**: (`optional`) list of str
357
+ Kept for backward compatibility purposes.
358
+ Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`)
359
+ List of tokens not to split.
360
+ **tokenize_chinese_chars**: (`optional`) boolean (default True)
361
+ Whether to tokenize Chinese characters.
362
+ This should likely be deactivated for Japanese:
363
+ see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328
364
+ """
365
+ if never_split is None:
366
+ never_split = []
367
+ self.do_lower_case = do_lower_case
368
+ self.never_split = set(never_split)
369
+ self.tokenize_chinese_chars = tokenize_chinese_chars
370
+
371
+ def tokenize(self, text, never_split=None):
372
+ """ Basic Tokenization of a piece of text.
373
+ Splits on "white spaces" only; for sub-word tokenization, see WordpieceTokenizer.
374
+
375
+ Args:
376
+ **never_split**: (`optional`) list of str
377
+ Kept for backward compatibility purposes.
378
+ Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`)
379
+ List of tokens not to split.
380
+ """
381
+ # union() returns a new set by concatenating the two sets.
382
+ never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
383
+
384
+ # This was added on November 1st, 2018 for the multilingual and Chinese
385
+ # models. This is also applied to the English models now, but it doesn't
386
+ # matter since the English models were not trained on any Chinese data
387
+ # and generally don't have any Chinese data in them (there are Chinese
388
+ # characters in the vocabulary because Wikipedia does have some Chinese
389
+ # words in the English Wikipedia.).
390
+ if self.tokenize_chinese_chars:
391
+ text = self._tokenize_chinese_chars(text)
392
+ orig_tokens = whitespace_tokenize(text)
393
+ split_tokens = []
394
+ for token in orig_tokens:
395
+ if self.do_lower_case and token not in never_split:
396
+ token = token.lower()
397
+ token = self._run_strip_accents(token)
398
+ split_tokens.extend(self._run_split_on_punc(token, never_split))
399
+
400
+ output_tokens = whitespace_tokenize(" ".join(split_tokens))
401
+ return output_tokens
402
+
403
+ def _run_strip_accents(self, text):
404
+ """Strips accents from a piece of text."""
405
+ text = unicodedata.normalize("NFD", text)
406
+ output = []
407
+ for char in text:
408
+ cat = unicodedata.category(char)
409
+ if cat == "Mn":
410
+ continue
411
+ output.append(char)
412
+ return "".join(output)
413
+
414
+ def _run_split_on_punc(self, text, never_split=None):
415
+ """Splits punctuation on a piece of text."""
416
+ if never_split is not None and text in never_split:
417
+ return [text]
418
+ chars = list(text)
419
+ i = 0
420
+ start_new_word = True
421
+ output = []
422
+ while i < len(chars):
423
+ char = chars[i]
424
+ if _is_punctuation(char):
425
+ output.append([char])
426
+ start_new_word = True
427
+ else:
428
+ if start_new_word:
429
+ output.append([])
430
+ start_new_word = False
431
+ output[-1].append(char)
432
+ i += 1
433
+
434
+ return ["".join(x) for x in output]
435
+
436
+ def _tokenize_chinese_chars(self, text):
437
+ """Adds whitespace around any CJK character."""
438
+ output = []
439
+ for char in text:
440
+ cp = ord(char)
441
+ if self._is_chinese_char(cp):
442
+ output.append(" ")
443
+ output.append(char)
444
+ output.append(" ")
445
+ else:
446
+ output.append(char)
447
+ return "".join(output)
448
+
449
+ def _is_chinese_char(self, cp):
450
+ """Checks whether CP is the codepoint of a CJK character."""
451
+ # This defines a "chinese character" as anything in the CJK Unicode block:
452
+ # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
453
+ #
454
+ # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
455
+ # despite its name. The modern Korean Hangul alphabet is a different block,
456
+ # as is Japanese Hiragana and Katakana. Those alphabets are used to write
457
+ # space-separated words, so they are not treated specially and handled
458
+ # like all of the other languages.
459
+ if (
460
+ (cp >= 0x4E00 and cp <= 0x9FFF)
461
+ or (cp >= 0x3400 and cp <= 0x4DBF) #
462
+ or (cp >= 0x20000 and cp <= 0x2A6DF) #
463
+ or (cp >= 0x2A700 and cp <= 0x2B73F) #
464
+ or (cp >= 0x2B740 and cp <= 0x2B81F) #
465
+ or (cp >= 0x2B820 and cp <= 0x2CEAF) #
466
+ or (cp >= 0xF900 and cp <= 0xFAFF)
467
+ or (cp >= 0x2F800 and cp <= 0x2FA1F) #
468
+ ): #
469
+ return True
470
+
471
+ return False
472
+
473
+ def _clean_text(self, text):
474
+ """Performs invalid character removal and whitespace cleanup on text."""
475
+ output = []
476
+ for char in text:
477
+ cp = ord(char)
478
+ if cp == 0 or cp == 0xFFFD or _is_control(char):
479
+ continue
480
+ if _is_whitespace(char):
481
+ output.append(" ")
482
+ else:
483
+ output.append(char)
484
+ return "".join(output)
485
+
486
+
487
+ class WordpieceTokenizer(object):
488
+ """Runs WordPiece tokenization."""
489
+
490
+ def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
491
+ self.vocab = vocab
492
+ self.unk_token = unk_token
493
+ self.max_input_chars_per_word = max_input_chars_per_word
494
+
495
+ def tokenize(self, text):
496
+ """Tokenizes a piece of text into its word pieces.
497
+
498
+ This uses a greedy longest-match-first algorithm to perform tokenization
499
+ using the given vocabulary.
500
+
501
+ For example:
502
+ input = "unaffable"
503
+ output = ["un", "##aff", "##able"]
504
+
505
+ Args:
506
+ text: A single token or whitespace separated tokens. This should have
507
+ already been passed through `BasicTokenizer`.
508
+
509
+ Returns:
510
+ A list of wordpiece tokens.
511
+ """
512
+
513
+ output_tokens = []
514
+ for token in whitespace_tokenize(text):
515
+ chars = list(token)
516
+ if len(chars) > self.max_input_chars_per_word:
517
+ output_tokens.append(self.unk_token)
518
+ continue
519
+
520
+ is_bad = False
521
+ start = 0
522
+ sub_tokens = []
523
+ while start < len(chars):
524
+ end = len(chars)
525
+ cur_substr = None
526
+ while start < end:
527
+ substr = "".join(chars[start:end])
528
+ if start > 0:
529
+ substr = "##" + substr
530
+ if substr in self.vocab:
531
+ cur_substr = substr
532
+ break
533
+ end -= 1
534
+ if cur_substr is None:
535
+ is_bad = True
536
+ break
537
+ sub_tokens.append(cur_substr)
538
+ start = end
539
+
540
+ if is_bad:
541
+ output_tokens.append(self.unk_token)
542
+ else:
543
+ output_tokens.extend(sub_tokens)
544
+ return output_tokens
545
+
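For orientation, the vendored slow tokenizer above can be exercised on its own. A minimal sketch, assuming the package is importable as `bert` (as laid out under CGFormer/bert/) and that the `bert-base-uncased` vocabulary is reachable through `from_pretrained`:

# Minimal sketch: tokenize a referring expression with the vendored BertTokenizer.
from bert.tokenization_bert import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

expr = "the man in a red shirt holding a frisbee"
tokens = tokenizer.tokenize(expr)                      # WordPiece pieces, e.g. ['the', 'man', ...]
ids = tokenizer.convert_tokens_to_ids(tokens)          # vocabulary indices
ids = tokenizer.build_inputs_with_special_tokens(ids)  # prepends [CLS], appends [SEP]
print(tokens)
print(ids)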
CGFormer/bert/tokenization_utils.py ADDED
@@ -0,0 +1,723 @@
1
+ # coding=utf-8
2
+ # Copyright 2020 The HuggingFace Inc. team.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ Tokenization classes for python tokenizers.
16
+ For fast tokenizers (provided by HuggingFace's tokenizers library) see tokenization_utils_fast.py
17
+ """
18
+
19
+ import itertools
20
+ import logging
21
+ import re
22
+ import unicodedata
23
+ from typing import Dict, List, Optional, Tuple, Union
24
+
25
+ from .file_utils import add_end_docstrings
26
+ from .tokenization_utils_base import (
27
+ ENCODE_KWARGS_DOCSTRING,
28
+ ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING,
29
+ AddedToken,
30
+ BatchEncoding,
31
+ EncodedInput,
32
+ EncodedInputPair,
33
+ PaddingStrategy,
34
+ PreTokenizedInput,
35
+ PreTokenizedInputPair,
36
+ PreTrainedTokenizerBase,
37
+ TensorType,
38
+ TextInput,
39
+ TextInputPair,
40
+ TruncationStrategy,
41
+ )
42
+
43
+
44
+ logger = logging.getLogger(__name__)
45
+
46
+
47
+ def _is_whitespace(char):
48
+ """Checks whether `chars` is a whitespace character."""
49
+ # \t, \n, and \r are technically control characters but we treat them
50
+ # as whitespace since they are generally considered as such.
51
+ if char == " " or char == "\t" or char == "\n" or char == "\r":
52
+ return True
53
+ cat = unicodedata.category(char)
54
+ if cat == "Zs":
55
+ return True
56
+ return False
57
+
58
+
59
+ def _is_control(char):
60
+ """Checks whether `chars` is a control character."""
61
+ # These are technically control characters but we count them as whitespace
62
+ # characters.
63
+ if char == "\t" or char == "\n" or char == "\r":
64
+ return False
65
+ cat = unicodedata.category(char)
66
+ if cat.startswith("C"):
67
+ return True
68
+ return False
69
+
70
+
71
+ def _is_punctuation(char):
72
+ """Checks whether `chars` is a punctuation character."""
73
+ cp = ord(char)
74
+ # We treat all non-letter/number ASCII as punctuation.
75
+ # Characters such as "^", "$", and "`" are not in the Unicode
76
+ # Punctuation class but we treat them as punctuation anyways, for
77
+ # consistency.
78
+ if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126):
79
+ return True
80
+ cat = unicodedata.category(char)
81
+ if cat.startswith("P"):
82
+ return True
83
+ return False
84
+
85
+
86
+ def _is_end_of_word(text):
87
+ """Checks whether the last character in text is one of a punctuation, control or whitespace character."""
88
+ last_char = text[-1]
89
+ return bool(_is_control(last_char) | _is_punctuation(last_char) | _is_whitespace(last_char))
90
+
91
+
92
+ def _is_start_of_word(text):
93
+ """Checks whether the first character in text is one of a punctuation, control or whitespace character."""
94
+ first_char = text[0]
95
+ return bool(_is_control(first_char) | _is_punctuation(first_char) | _is_whitespace(first_char))
96
+
97
+
98
+ class PreTrainedTokenizer(PreTrainedTokenizerBase):
99
+ """ Base class for all slow tokenizers.
100
+
101
+ Handles all the shared methods for tokenization and special tokens, as well as methods for
102
+ downloading/caching/loading pretrained tokenizers and for adding tokens to the vocabulary.
103
+
104
+ This class also contains the added tokens in a unified way on top of all tokenizers so we don't
105
+ have to handle the specific vocabulary augmentation methods of the various underlying
106
+ dictionary structures (BPE, sentencepiece...).
107
+
108
+ Class attributes (overridden by derived classes):
109
+
110
+ - ``vocab_files_names``: a python ``dict`` with, as keys, the ``__init__`` keyword name of each vocabulary file
111
+ required by the model, and as associated values, the filename for saving the associated file (string).
112
+ - ``pretrained_vocab_files_map``: a python ``dict of dict`` the high-level keys
113
+ being the ``__init__`` keyword name of each vocabulary file required by the model, the low-level being the
114
+ `short-cut-names` (string) of the pretrained models with, as associated values, the `url` (string) to the
115
+ associated pretrained vocabulary file.
116
+ - ``max_model_input_sizes``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained
117
+ models, and as associated values, the maximum length of the sequence inputs of this model, or None if the
118
+ model has no maximum input size.
119
+ - ``pretrained_init_configuration``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the
120
+ pretrained models, and as associated values, a dictionary of specific arguments to pass to the
121
+ ``__init__``method of the tokenizer class for this pretrained model when loading the tokenizer with the
122
+ ``from_pretrained()`` method.
123
+
124
+ Args:
125
+ - ``model_max_length``: (`Optional`) int: the maximum length in number of tokens for the inputs to the transformer model.
126
+ When the tokenizer is loaded with `from_pretrained`, this will be set to the value stored for the associated
127
+ model in ``max_model_input_sizes`` (see above). If no value is provided and
128
+ no associated max_length can be found in ``max_model_input_sizes``, it will default to VERY_LARGE_INTEGER (`int(1e30)`).
129
+ - ``padding_side``: (`Optional`) string: the side on which the model should have padding applied.
130
+ Should be selected between ['right', 'left']
131
+ - ``model_input_names``: (`Optional`) List[string]: the list of the forward pass inputs accepted by the
132
+ model ("token_type_ids", "attention_mask"...).
133
+ - ``bos_token``: (`Optional`) string: a beginning of sentence token.
134
+ Will be associated to ``self.bos_token`` and ``self.bos_token_id``
135
+ - ``eos_token``: (`Optional`) string: an end of sentence token.
136
+ Will be associated to ``self.eos_token`` and ``self.eos_token_id``
137
+ - ``unk_token``: (`Optional`) string: an unknown token.
138
+ Will be associated to ``self.unk_token`` and ``self.unk_token_id``
139
+ - ``sep_token``: (`Optional`) string: a separation token (e.g. to separate context and query in an input sequence).
140
+ Will be associated to ``self.sep_token`` and ``self.sep_token_id``
141
+ - ``pad_token``: (`Optional`) string: a padding token.
142
+ Will be associated to ``self.pad_token`` and ``self.pad_token_id``
143
+ - ``cls_token``: (`Optional`) string: a classification token (e.g. to extract a summary of an input sequence
144
+ leveraging self-attention along the full depth of the model).
145
+ Will be associated to ``self.cls_token`` and ``self.cls_token_id``
146
+ - ``mask_token``: (`Optional`) string: a masking token (e.g. when training a model with masked-language
147
+ modeling). Will be associated to ``self.mask_token`` and ``self.mask_token_id``
148
+ - ``additional_special_tokens``: (`Optional`) list: a list of additional special tokens.
149
+ Adding all special tokens here ensures they won't be split by the tokenization process.
150
+ Will be associated to ``self.additional_special_tokens`` and ``self.additional_special_tokens_ids``
151
+
152
+
153
+ .. automethod:: __call__
154
+ """
155
+
156
+ def __init__(self, **kwargs):
157
+ super().__init__(**kwargs)
158
+
159
+ # Added tokens - We store this for both slow and fast tokenizers
160
+ # until the serialization of Fast tokenizers is updated
161
+ self.added_tokens_encoder: Dict[str, int] = {}
162
+ self.added_tokens_decoder: Dict[int, str] = {}
163
+ self.unique_no_split_tokens: List[str] = []
164
+
165
+ @property
166
+ def is_fast(self) -> bool:
167
+ return False
168
+
169
+ @property
170
+ def vocab_size(self) -> int:
171
+ """ Size of the base vocabulary (without the added tokens) """
172
+ raise NotImplementedError
173
+
174
+ def get_vocab(self):
175
+ """ Returns the vocabulary as a dict of {token: index} pairs. `tokenizer.get_vocab()[token]` is equivalent to `tokenizer.convert_tokens_to_ids(token)` when `token` is in the vocab. """
176
+ raise NotImplementedError()
177
+
178
+ def get_added_vocab(self) -> Dict[str, int]:
179
+ return self.added_tokens_encoder
180
+
181
+ def __len__(self):
182
+ """ Size of the full vocabulary with the added tokens """
183
+ return self.vocab_size + len(self.added_tokens_encoder)
184
+
185
+ def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens=False) -> int:
186
+ """
187
+ Add a list of new tokens to the tokenizer class. If the new tokens are not in the
188
+ vocabulary, they are added to it with indices starting from length of the current vocabulary.
189
+
190
+ Args:
191
+ new_tokens: string or list of string. Each string is a token to add. Tokens are only added if they are not
192
+ already in the vocabulary (tested by checking whether the tokenizer assigns the index of the ``unk_token`` to them).
193
+
194
+ Returns:
195
+ Number of tokens added to the vocabulary.
196
+
197
+ Examples::
198
+
199
+ # Let's see how to increase the vocabulary of Bert model and tokenizer
200
+ tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
201
+ model = BertModel.from_pretrained('bert-base-uncased')
202
+
203
+ num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
204
+ print('We have added', num_added_toks, 'tokens')
205
+ model.resize_token_embeddings(len(tokenizer)) # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
206
+ """
207
+ new_tokens = [str(tok) for tok in new_tokens]
208
+
209
+ tokens_to_add = []
210
+ for token in new_tokens:
211
+ assert isinstance(token, str)
212
+ if not special_tokens and self.init_kwargs.get("do_lower_case", False):
213
+ token = token.lower()
214
+ if (
215
+ token != self.unk_token
216
+ and self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token)
217
+ and token not in tokens_to_add
218
+ ):
219
+ tokens_to_add.append(token)
220
+ if self.verbose:
221
+ logger.info("Adding %s to the vocabulary", token)
222
+
223
+ added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(tokens_to_add))
224
+ added_tok_decoder = {v: k for k, v in added_tok_encoder.items()}
225
+ self.added_tokens_encoder.update(added_tok_encoder)
226
+ self.added_tokens_decoder.update(added_tok_decoder)
227
+
228
+ # Make sure we don't split on any special tokens (even if they were already in the vocab before, e.g. for Albert)
229
+ if special_tokens:
230
+ self.unique_no_split_tokens = list(set(self.unique_no_split_tokens).union(set(new_tokens)))
231
+ else:
232
+ # Or on the newly added tokens
233
+ self.unique_no_split_tokens = list(set(self.unique_no_split_tokens).union(set(tokens_to_add)))
234
+
235
+ return len(tokens_to_add)
236
+
237
+ def num_special_tokens_to_add(self, pair=False):
238
+ """
239
+ Returns the number of added tokens when encoding a sequence with special tokens.
240
+
241
+ Note:
242
+ This encodes inputs and checks the number of added tokens, and is therefore not efficient. Do not put this
243
+ inside your training loop.
244
+
245
+ Args:
246
+ pair: Returns the number of added tokens in the case of a sequence pair if set to True, returns the
247
+ number of added tokens in the case of a single sequence if set to False.
248
+
249
+ Returns:
250
+ Number of tokens added to sequences
251
+ """
252
+ token_ids_0 = []
253
+ token_ids_1 = []
254
+ return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None))
255
+
256
+ def tokenize(self, text: TextInput, **kwargs):
257
+ """ Converts a string into a sequence of tokens (string), using the tokenizer.
258
+ Splits into words for word-based vocabularies or sub-words for sub-word-based
259
+ vocabularies (BPE/SentencePieces/WordPieces).
260
+
261
+ Take care of added tokens.
262
+
263
+ Args:
264
+ text (:obj:`string`): The sequence to be encoded.
265
+ **kwargs (:obj: `dict`): Arguments passed to the model-specific `prepare_for_tokenization` preprocessing method.
266
+ """
267
+ # Simple mapping string => AddedToken for special tokens with specific tokenization behaviors
268
+ all_special_tokens_extended = dict(
269
+ (str(t), t) for t in self.all_special_tokens_extended if isinstance(t, AddedToken)
270
+ )
271
+
272
+ text, kwargs = self.prepare_for_tokenization(text, **kwargs)
273
+
274
+ if kwargs:
275
+ logger.warning(f"Keyword arguments {kwargs} not recognized.")
276
+
277
+ # TODO: should this be in the base class?
278
+ if self.init_kwargs.get("do_lower_case", False):
279
+ # convert non-special tokens to lowercase
280
+ escaped_special_toks = [re.escape(s_tok) for s_tok in self.all_special_tokens]
281
+ pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)"
282
+ text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text)
283
+
284
+ def split_on_token(tok, text):
285
+ result = []
286
+ tok_extended = all_special_tokens_extended.get(tok, None)
287
+ split_text = text.split(tok)
288
+ full_word = ""
289
+ for i, sub_text in enumerate(split_text):
290
+ # AddedToken can control whitespace stripping around them.
291
+ # We use them for GPT2 and Roberta to have different behavior depending on the special token
292
+ # Cf. https://github.com/huggingface/transformers/pull/2778
293
+ # and https://github.com/huggingface/transformers/issues/3788
294
+ if isinstance(tok_extended, AddedToken):
295
+ if tok_extended.single_word:
296
+ # Try to avoid splitting on token
297
+ if (
298
+ i < len(split_text) - 1
299
+ and not _is_end_of_word(sub_text)
300
+ and not _is_start_of_word(split_text[i + 1])
301
+ ):
302
+ # Don't extract the special token
303
+ full_word += sub_text + tok
304
+ elif full_word:
305
+ full_word += sub_text
306
+ result += [full_word]
307
+ full_word = ""
308
+ continue
309
+ # Strip white spaces on the right
310
+ if tok_extended.rstrip and i > 0:
311
+ # A bit counter-intuitive but we strip the left of the string
312
+ # since tok_extended.rstrip means the special token is eating all white spaces on its right
313
+ sub_text = sub_text.lstrip()
314
+ # Strip white spaces on the left
315
+ if tok_extended.lstrip and i < len(split_text) - 1:
316
+ sub_text = sub_text.rstrip() # Opposite here
317
+ else:
318
+ # We strip left and right by default
319
+ if i < len(split_text) - 1:
320
+ sub_text = sub_text.rstrip()
321
+ if i > 0:
322
+ sub_text = sub_text.lstrip()
323
+
324
+ if i == 0 and not sub_text:
325
+ result += [tok]
326
+ elif i == len(split_text) - 1:
327
+ if sub_text:
328
+ result += [sub_text]
329
+ else:
330
+ pass
331
+ else:
332
+ if sub_text:
333
+ result += [sub_text]
334
+ result += [tok]
335
+ return result
336
+
337
+ def split_on_tokens(tok_list, text):
338
+ if not text.strip():
339
+ return []
340
+ if not tok_list:
341
+ return self._tokenize(text)
342
+
343
+ tokenized_text = []
344
+ text_list = [text]
345
+ for tok in tok_list:
346
+ tokenized_text = []
347
+ for sub_text in text_list:
348
+ if sub_text not in self.unique_no_split_tokens:
349
+ tokenized_text += split_on_token(tok, sub_text)
350
+ else:
351
+ tokenized_text += [sub_text]
352
+ text_list = tokenized_text
353
+
354
+ return list(
355
+ itertools.chain.from_iterable(
356
+ (
357
+ self._tokenize(token) if token not in self.unique_no_split_tokens else [token]
358
+ for token in tokenized_text
359
+ )
360
+ )
361
+ )
362
+
363
+ no_split_token = self.unique_no_split_tokens
364
+ tokenized_text = split_on_tokens(no_split_token, text)
365
+ return tokenized_text
366
+
367
+ def _tokenize(self, text, **kwargs):
368
+ """ Converts a string into a sequence of tokens (string), using the tokenizer.
369
+ Splits into words for word-based vocabularies or sub-words for sub-word-based
370
+ vocabularies (BPE/SentencePieces/WordPieces).
371
+
372
+ Do NOT take care of added tokens.
373
+ """
374
+ raise NotImplementedError
375
+
376
+ def convert_tokens_to_ids(self, tokens):
377
+ """ Converts a token string (or a sequence of tokens) in a single integer id
378
+ (or a sequence of ids), using the vocabulary.
379
+ """
380
+ if tokens is None:
381
+ return None
382
+
383
+ if isinstance(tokens, str):
384
+ return self._convert_token_to_id_with_added_voc(tokens)
385
+
386
+ ids = []
387
+ for token in tokens:
388
+ ids.append(self._convert_token_to_id_with_added_voc(token))
389
+ return ids
390
+
391
+ def _convert_token_to_id_with_added_voc(self, token):
392
+ if token is None:
393
+ return None
394
+
395
+ if token in self.added_tokens_encoder:
396
+ return self.added_tokens_encoder[token]
397
+ return self._convert_token_to_id(token)
398
+
399
+ def _convert_token_to_id(self, token):
400
+ raise NotImplementedError
401
+
402
+ def _encode_plus(
403
+ self,
404
+ text: Union[TextInput, PreTokenizedInput, EncodedInput],
405
+ text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None,
406
+ add_special_tokens: bool = True,
407
+ padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
408
+ truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
409
+ max_length: Optional[int] = None,
410
+ stride: int = 0,
411
+ is_pretokenized: bool = False,
412
+ pad_to_multiple_of: Optional[int] = None,
413
+ return_tensors: Optional[Union[str, TensorType]] = None,
414
+ return_token_type_ids: Optional[bool] = None,
415
+ return_attention_mask: Optional[bool] = None,
416
+ return_overflowing_tokens: bool = False,
417
+ return_special_tokens_mask: bool = False,
418
+ return_offsets_mapping: bool = False,
419
+ return_length: bool = False,
420
+ verbose: bool = True,
421
+ **kwargs
422
+ ) -> BatchEncoding:
423
+ def get_input_ids(text):
424
+ if isinstance(text, str):
425
+ tokens = self.tokenize(text, **kwargs)
426
+ return self.convert_tokens_to_ids(tokens)
427
+ elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
428
+ if is_pretokenized:
429
+ tokens = list(itertools.chain(*(self.tokenize(t, is_pretokenized=True, **kwargs) for t in text)))
430
+ return self.convert_tokens_to_ids(tokens)
431
+ else:
432
+ return self.convert_tokens_to_ids(text)
433
+ elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
434
+ return text
435
+ else:
436
+ if is_pretokenized:
437
+ raise ValueError(
438
+ f"Input {text} is not valid. Should be a string or a list/tuple of strings when `is_pretokenized=True`."
439
+ )
440
+ else:
441
+ raise ValueError(
442
+ f"Input {text} is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
443
+ )
444
+
445
+ if return_offsets_mapping:
446
+ raise NotImplementedError(
447
+ "return_offset_mapping is not available when using Python tokenizers."
448
+ "To use this feature, change your tokenizer to one deriving from "
449
+ "transformers.PreTrainedTokenizerFast."
450
+ "More information on available tokenizers at "
451
+ "https://github.com/huggingface/transformers/pull/2674"
452
+ )
453
+
454
+ first_ids = get_input_ids(text)
455
+ second_ids = get_input_ids(text_pair) if text_pair is not None else None
456
+
457
+ return self.prepare_for_model(
458
+ first_ids,
459
+ pair_ids=second_ids,
460
+ add_special_tokens=add_special_tokens,
461
+ padding=padding_strategy.value,
462
+ truncation=truncation_strategy.value,
463
+ max_length=max_length,
464
+ stride=stride,
465
+ pad_to_multiple_of=pad_to_multiple_of,
466
+ return_tensors=return_tensors,
467
+ prepend_batch_axis=True,
468
+ return_attention_mask=return_attention_mask,
469
+ return_token_type_ids=return_token_type_ids,
470
+ return_overflowing_tokens=return_overflowing_tokens,
471
+ return_special_tokens_mask=return_special_tokens_mask,
472
+ return_length=return_length,
473
+ verbose=verbose,
474
+ )
475
+
476
+ def _batch_encode_plus(
477
+ self,
478
+ batch_text_or_text_pairs: Union[
479
+ List[TextInput],
480
+ List[TextInputPair],
481
+ List[PreTokenizedInput],
482
+ List[PreTokenizedInputPair],
483
+ List[EncodedInput],
484
+ List[EncodedInputPair],
485
+ ],
486
+ add_special_tokens: bool = True,
487
+ padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
488
+ truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
489
+ max_length: Optional[int] = None,
490
+ stride: int = 0,
491
+ is_pretokenized: bool = False,
492
+ pad_to_multiple_of: Optional[int] = None,
493
+ return_tensors: Optional[Union[str, TensorType]] = None,
494
+ return_token_type_ids: Optional[bool] = None,
495
+ return_attention_mask: Optional[bool] = None,
496
+ return_overflowing_tokens: bool = False,
497
+ return_special_tokens_mask: bool = False,
498
+ return_offsets_mapping: bool = False,
499
+ return_length: bool = False,
500
+ verbose: bool = True,
501
+ **kwargs
502
+ ) -> BatchEncoding:
503
+ def get_input_ids(text):
504
+ if isinstance(text, str):
505
+ tokens = self.tokenize(text, **kwargs)
506
+ return self.convert_tokens_to_ids(tokens)
507
+ elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
508
+ if is_pretokenized:
509
+ tokens = list(itertools.chain(*(self.tokenize(t, is_pretokenized=True, **kwargs) for t in text)))
510
+ return self.convert_tokens_to_ids(tokens)
511
+ else:
512
+ return self.convert_tokens_to_ids(text)
513
+ elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
514
+ return text
515
+ else:
516
+ raise ValueError(
517
+ "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers."
518
+ )
519
+
520
+ if return_offsets_mapping:
521
+ raise NotImplementedError(
522
+ "return_offset_mapping is not available when using Python tokenizers."
523
+ "To use this feature, change your tokenizer to one deriving from "
524
+ "transformers.PreTrainedTokenizerFast."
525
+ )
526
+
527
+ input_ids = []
528
+ for ids_or_pair_ids in batch_text_or_text_pairs:
529
+ if not isinstance(ids_or_pair_ids, (list, tuple)):
530
+ ids, pair_ids = ids_or_pair_ids, None
531
+ elif is_pretokenized and not isinstance(ids_or_pair_ids[0], (list, tuple)):
532
+ ids, pair_ids = ids_or_pair_ids, None
533
+ else:
534
+ ids, pair_ids = ids_or_pair_ids
535
+
536
+ first_ids = get_input_ids(ids)
537
+ second_ids = get_input_ids(pair_ids) if pair_ids is not None else None
538
+ input_ids.append((first_ids, second_ids))
539
+
540
+ batch_outputs = self._batch_prepare_for_model(
541
+ input_ids,
542
+ add_special_tokens=add_special_tokens,
543
+ padding_strategy=padding_strategy,
544
+ truncation_strategy=truncation_strategy,
545
+ max_length=max_length,
546
+ stride=stride,
547
+ pad_to_multiple_of=pad_to_multiple_of,
548
+ return_attention_mask=return_attention_mask,
549
+ return_token_type_ids=return_token_type_ids,
550
+ return_overflowing_tokens=return_overflowing_tokens,
551
+ return_special_tokens_mask=return_special_tokens_mask,
552
+ return_length=return_length,
553
+ return_tensors=return_tensors,
554
+ verbose=verbose,
555
+ )
556
+
557
+ return BatchEncoding(batch_outputs)
558
+
559
+ @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
560
+ def _batch_prepare_for_model(
561
+ self,
562
+ batch_ids_pairs: List[Union[PreTokenizedInputPair, Tuple[List[int], None]]],
563
+ add_special_tokens: bool = True,
564
+ padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
565
+ truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
566
+ max_length: Optional[int] = None,
567
+ stride: int = 0,
568
+ pad_to_multiple_of: Optional[int] = None,
569
+ return_tensors: Optional[str] = None,
570
+ return_token_type_ids: Optional[bool] = None,
571
+ return_attention_mask: Optional[bool] = None,
572
+ return_overflowing_tokens: bool = False,
573
+ return_special_tokens_mask: bool = False,
574
+ return_length: bool = False,
575
+ verbose: bool = True,
576
+ ) -> BatchEncoding:
577
+ """ Prepares a sequence of input ids, or a pair of sequences of input ids, so that it can be used by the model.
578
+ It adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
579
+ manages a moving window (with user defined stride) for overflowing tokens
580
+
581
+ Args:
582
+ batch_ids_pairs: list of tokenized input ids or input ids pairs
583
+ """
584
+
585
+ batch_outputs = {}
586
+ for first_ids, second_ids in batch_ids_pairs:
587
+ outputs = self.prepare_for_model(
588
+ first_ids,
589
+ second_ids,
590
+ add_special_tokens=add_special_tokens,
591
+ padding=PaddingStrategy.DO_NOT_PAD.value, # we pad in batch afterward
592
+ truncation=truncation_strategy.value,
593
+ max_length=max_length,
594
+ stride=stride,
595
+ pad_to_multiple_of=None, # we pad in batch afterward
596
+ return_attention_mask=False, # we pad in batch afterward
597
+ return_token_type_ids=return_token_type_ids,
598
+ return_overflowing_tokens=return_overflowing_tokens,
599
+ return_special_tokens_mask=return_special_tokens_mask,
600
+ return_length=return_length,
601
+ return_tensors=None, # We convert the whole batch to tensors at the end
602
+ prepend_batch_axis=False,
603
+ verbose=verbose,
604
+ )
605
+
606
+ for key, value in outputs.items():
607
+ if key not in batch_outputs:
608
+ batch_outputs[key] = []
609
+ batch_outputs[key].append(value)
610
+
611
+ batch_outputs = self.pad(
612
+ batch_outputs,
613
+ padding=padding_strategy.value,
614
+ max_length=max_length,
615
+ pad_to_multiple_of=pad_to_multiple_of,
616
+ return_attention_mask=return_attention_mask,
617
+ )
618
+
619
+ batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
620
+
621
+ return batch_outputs
622
+
623
+ def prepare_for_tokenization(self, text: str, is_pretokenized=False, **kwargs) -> (str, dict):
624
+ """ Performs any necessary transformations before tokenization.
625
+
626
+ This method should pop the arguments from kwargs and return kwargs as well.
627
+ We test kwargs at the end of the encoding process to be sure all the arguments have been used.
628
+ """
629
+ return (text, kwargs)
630
+
631
+ def get_special_tokens_mask(
632
+ self, token_ids_0: List, token_ids_1: Optional[List] = None, already_has_special_tokens: bool = False
633
+ ) -> List[int]:
634
+ """
635
+ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
636
+ special tokens using the tokenizer ``prepare_for_model`` method.
637
+
638
+ Args:
639
+ token_ids_0: list of ids (must not contain special tokens)
640
+ token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
641
+ for sequence pairs
642
+ already_has_special_tokens: (default False) Set to True if the token list is already formatted with
643
+ special tokens for the model
644
+
645
+ Returns:
646
+ A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
647
+ """
648
+ return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0))
649
+
650
+ def convert_ids_to_tokens(
651
+ self, ids: Union[int, List[int]], skip_special_tokens: bool = False
652
+ ) -> Union[str, List[str]]:
653
+ """ Converts a single index or a sequence of indices (integers) into a token
654
+ (resp. into a sequence of tokens (str)), using the vocabulary and added tokens.
655
+
656
+ Args:
657
+ skip_special_tokens: Don't decode special tokens (self.all_special_tokens). Default: False
658
+ """
659
+ if isinstance(ids, int):
660
+ if ids in self.added_tokens_decoder:
661
+ return self.added_tokens_decoder[ids]
662
+ else:
663
+ return self._convert_id_to_token(ids)
664
+ tokens = []
665
+ for index in ids:
666
+ index = int(index)
667
+ if skip_special_tokens and index in self.all_special_ids:
668
+ continue
669
+ if index in self.added_tokens_decoder:
670
+ tokens.append(self.added_tokens_decoder[index])
671
+ else:
672
+ tokens.append(self._convert_id_to_token(index))
673
+ return tokens
674
+
675
+ def _convert_id_to_token(self, index: int) -> str:
676
+ raise NotImplementedError
677
+
678
+ def convert_tokens_to_string(self, tokens: List[str]) -> str:
679
+ """ Converts a sequence of tokens (string) into a single string.
680
+ The simplest way to do it is ' '.join(self.convert_ids_to_tokens(token_ids)),
681
+ but we often want to remove sub-word tokenization artifacts at the same time.
682
+ """
683
+ return " ".join(self.convert_ids_to_tokens(tokens))
684
+
685
+ def decode(
686
+ self, token_ids: List[int], skip_special_tokens: bool = False, clean_up_tokenization_spaces: bool = True
687
+ ) -> str:
688
+ filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
689
+
690
+ # To avoid mixing byte-level and unicode for byte-level BPE,
691
+ # we need to build the string separately for added tokens and byte-level tokens
692
+ # cf. https://github.com/huggingface/transformers/issues/1133
693
+ sub_texts = []
694
+ current_sub_text = []
695
+ for token in filtered_tokens:
696
+ if skip_special_tokens and token in self.all_special_ids:
697
+ continue
698
+ if token in self.added_tokens_encoder:
699
+ if current_sub_text:
700
+ sub_texts.append(self.convert_tokens_to_string(current_sub_text))
701
+ current_sub_text = []
702
+ sub_texts.append(token)
703
+ else:
704
+ current_sub_text.append(token)
705
+ if current_sub_text:
706
+ sub_texts.append(self.convert_tokens_to_string(current_sub_text))
707
+ text = " ".join(sub_texts)
708
+
709
+ if clean_up_tokenization_spaces:
710
+ clean_text = self.clean_up_tokenization(text)
711
+ return clean_text
712
+ else:
713
+ return text
714
+
715
+ def save_vocabulary(self, save_directory) -> Tuple[str]:
716
+ """ Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens
717
+ and special token mappings.
718
+
719
+ Please use :func:`~transformers.PreTrainedTokenizer.save_pretrained` `()` to save the full
720
+ Tokenizer state if you want to reload it using the :func:`~transformers.PreTrainedTokenizer.from_pretrained`
721
+ class method.
722
+ """
723
+ raise NotImplementedError
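The added-token machinery defined above (``_add_tokens`` plus the ``split_on_tokens`` logic inside ``tokenize``) keeps newly registered tokens out of the sub-word splitting. A small sketch of that behavior, reusing the BertTokenizer from this commit (``add_tokens`` itself comes from the base class in tokenization_utils_base.py):

# Sketch: added tokens are never split by WordPiece once registered.
from bert.tokenization_bert import BertTokenizer

tok = BertTokenizer.from_pretrained("bert-base-uncased")

print(tok.tokenize("a frisbee-player"))   # split into sub-words/punctuation, exact pieces depend on the vocab
tok.add_tokens(["frisbee-player"])        # returns the number of tokens actually added
print(tok.tokenize("a frisbee-player"))   # ['a', 'frisbee-player'] -- the added token stays whole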
CGFormer/bert/tokenization_utils_base.py ADDED
The diff for this file is too large to render. See raw diff
 
CGFormer/ckpts/swin_base_patch4_window12_384_22k.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70812ab6b0a7a38712409d13976df9431632466eaacf991d5e90d9a1e91f3ab1
3
+ size 450809979
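The checkpoint itself lives in Git LFS, so only the pointer above is tracked in the repository. A small sketch for checking a fetched copy against the recorded size and sha256 (assuming the file has already been pulled, e.g. with `git lfs pull`):

# Verify the Swin-B checkpoint against the LFS pointer recorded above.
import hashlib
import os

path = "ckpts/swin_base_patch4_window12_384_22k.pth"

sha = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        sha.update(chunk)

assert os.path.getsize(path) == 450809979
assert sha.hexdigest() == "70812ab6b0a7a38712409d13976df9431632466eaacf991d5e90d9a1e91f3ab1"
print("checkpoint matches the LFS pointer")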
CGFormer/config/config_gref_ace.yaml ADDED
@@ -0,0 +1,63 @@
1
+ DATA:
2
+ dataset: refcocog_u
3
+ train_split: train
4
+ train_lmdb: data/lmdb/refcocog_u/train.lmdb
5
+ val_split: val
6
+ val_lmdb: data/lmdb/refcocog_u/val.lmdb
7
+ mask_root: data/masks/refcocog_u
8
+ TRAIN:
9
+ swin_type: base
10
+ swin_pretrain: ckpts/swin_base_patch4_window12_384_22k.pth
11
+ bert: bert-base-uncased
12
+ mha: '8-8-8-8'
13
+ input_size: 480
14
+ word_len: 20
15
+ word_dim: 768
16
+ vis_dim: 512
17
+ num_token: 2
18
+ token_dim: 512
19
+ sync_bn: True
20
+ dropout: 0.
21
+ fusion_drop: 0.
22
+ workers: 32 # data loader workers
23
+ workers_val: 8
24
+ batch_size: 64 # batch size for training
25
+ batch_size_val: 16 # batch size for validation during training, memory and speed tradeoff
26
+ start_epoch: 0
27
+ epochs: 50
28
+ lr_backbone: 5.e-5
29
+ lr_text_encoder: 5.e-5
30
+ lr: 1.e-4
31
+ weight_decay: 1.e-4
32
+ amsgrad: True
33
+ manual_seed:
34
+ print_freq: 100
35
+ exp_name: cgformer_test
36
+ output_folder: /data/seunghoon/CGFormer/exp/seunghoon
37
+ save_freq: 1
38
+ weight:
39
+ resume:
40
+ evaluate: True
41
+ metric_learning: True
42
+ exclude_multiobj: True
43
+ metric_mode: hardpos_only_refined
44
+ metric_loss_weight: 0.1
45
+ loss_option: ACE_verbonly
46
+ margin_value: 12
47
+ temperature: 0.07
48
+ hp_selection: strict
49
+ filter_threshold: 0.5
50
+ mixup_lasttwo : False
51
+ use_projections : False
52
+
53
+ Distributed:
54
+ # dist_url: tcp://localhost:18123
55
+ dist_backend: 'nccl'
56
+ # multiprocessing_distributed: True
57
+ world_size: 1
58
+ # rank: 0
59
+ TEST:
60
+ window12: True # if window12-pretrained weights were used for training, set this to True for testing
61
+ test_split: test
62
+ test_lmdb: data/lmdb/refcocog_u/test.lmdb
63
+ visualize: False
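The config above (and the ones that follow) is plain nested YAML, so it can be inspected without going through the training entry point. A sketch using PyYAML; how the repo's own loader wraps this into a config object is an assumption and may differ:

# Quick look at a training config (run from the CGFormer/ directory).
import yaml

with open("config/config_gref_ace.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["DATA"]["dataset"])           # refcocog_u
print(cfg["TRAIN"]["batch_size"])       # 64
print(cfg["TRAIN"]["metric_learning"])  # True for this ACE run
print(cfg["TEST"]["test_split"])        # test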
CGFormer/config/config_mosaic_refcocog_u.yaml ADDED
@@ -0,0 +1,51 @@
1
+ DATA:
2
+ dataset: refcocog_u
3
+ train_split: train
4
+ train_lmdb: data/lmdb/refcocog_u/train.lmdb
5
+ val_split: val
6
+ val_lmdb: data/lmdb/refcocog_u/val.lmdb
7
+ mask_root: data/masks/refcocog_u
8
+ TRAIN:
9
+ swin_type: base
10
+ swin_pretrain: ckpts/swin_base_patch4_window12_384_22k.pth
11
+ bert: bert-base-uncased
12
+ mha: '8-8-8-8'
13
+ input_size: 480
14
+ word_len: 20
15
+ word_dim: 768
16
+ vis_dim: 512
17
+ num_token: 2
18
+ token_dim: 512
19
+ sync_bn: True
20
+ dropout: 0.
21
+ fusion_drop: 0.
22
+ workers: 32 # data loader workers
23
+ workers_val: 8
24
+ batch_size: 64 # batch size for training
25
+ batch_size_val: 16 # batch size for validation during training, memory and speed tradeoff
26
+ start_epoch: 0
27
+ epochs: 50
28
+ lr_backbone: 5.e-5
29
+ lr_text_encoder: 5.e-5
30
+ lr: 1.e-4
31
+ weight_decay: 1.e-4
32
+ amsgrad: True
33
+ manual_seed:
34
+ print_freq: 100
35
+ exp_name: cgformer
36
+ output_folder: exp/mosaic_refcocog_u/
37
+ save_freq: 1
38
+ weight:
39
+ resume:
40
+ evaluate: True
41
+ Distributed:
42
+ dist_url: tcp://localhost:12345
43
+ dist_backend: 'nccl'
44
+ multiprocessing_distributed: True
45
+ world_size: 1
46
+ rank: 0
47
+ TEST:
48
+ window12: True # if window12-pretrained weights were used for training, set this to True for testing
49
+ test_split: val
50
+ test_lmdb: data/lmdb/refcocog_u/val.lmdb
51
+ visualize: False
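Unlike the ACE configs, this file keeps the ``dist_url`` / ``multiprocessing_distributed`` fields uncommented in the Distributed block. A hedged sketch of how such fields typically map onto ``torch.distributed.init_process_group``; the exact wiring in the repo's entry point is an assumption:

# Hypothetical wiring of the Distributed block; the repo's launcher may differ.
import torch.distributed as dist

dist_cfg = {
    "dist_url": "tcp://localhost:12345",
    "dist_backend": "nccl",
    "world_size": 1,
    "rank": 0,
}

dist.init_process_group(
    backend=dist_cfg["dist_backend"],
    init_method=dist_cfg["dist_url"],
    world_size=dist_cfg["world_size"],
    rank=dist_cfg["rank"],
)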
CGFormer/config/config_rcc_ace.yaml ADDED
@@ -0,0 +1,63 @@
1
+ DATA:
2
+ dataset: refcoco
3
+ train_split: train
4
+ train_lmdb: data/lmdb/refcoco/train.lmdb
5
+ val_split: val
6
+ val_lmdb: data/lmdb/refcoco/val.lmdb
7
+ mask_root: data/masks/refcoco
8
+ TRAIN:
9
+ swin_type: base
10
+ swin_pretrain: ckpts/swin_base_patch4_window12_384_22k.pth
11
+ bert: bert-base-uncased
12
+ mha: '8-8-8-8'
13
+ input_size: 480
14
+ word_len: 20
15
+ word_dim: 768
16
+ vis_dim: 512
17
+ num_token: 2
18
+ token_dim: 512
19
+ sync_bn: True
20
+ dropout: 0.
21
+ fusion_drop: 0.
22
+ workers: 16 # data loader workers
23
+ workers_val: 8
24
+ batch_size: 64 #batch size for training
25
+ batch_size_val: 24 # batch size for validation during training, memory and speed tradeoff
26
+ start_epoch: 0
27
+ epochs: 50
28
+ lr_backbone: 5.e-5
29
+ lr_text_encoder: 5.e-5
30
+ lr: 1.e-4
31
+ weight_decay: 1.e-4
32
+ amsgrad: True
33
+ manual_seed:
34
+ print_freq: 100
35
+ exp_name: cgformer_test
36
+ output_folder: /data/seunghoon/CGFormer/exp/seunghoon
37
+ save_freq: 1
38
+ weight:
39
+ resume:
40
+ evaluate: True
41
+ metric_learning: False
42
+ exclude_multiobj: true
43
+ metric_mode: hardpos_only_refined
44
+ metric_loss_weight: 0.1
45
+ loss_option: ACE_verbonly
46
+ margin_value: 12
47
+ temperature: 0.07
48
+ hp_selection: strict
49
+ use_projections : True
50
+ mixup_lasttwo : False
51
+ filter_threshold: 0.68
52
+
53
+ Distributed:
54
+ # dist_url: tcp://localhost:18123
55
+ dist_backend: 'nccl'
56
+ # multiprocessing_distributed: True
57
+ world_size: 1
58
+ # rank: 0
59
+ TEST:
60
+ window12: True # if window12-pretrained weights were used for training, set this to True for testing
61
+ test_split: test
62
+ test_lmdb: data/lmdb/refcocog_u/test.lmdb
63
+ visualize: False
CGFormer/config/config_rccp_ace.yaml ADDED
@@ -0,0 +1,63 @@
1
+ DATA:
2
+ dataset: refcoco+
3
+ train_split: train
4
+ train_lmdb: data/lmdb/refcoco+/train.lmdb
5
+ val_split: val
6
+ val_lmdb: data/lmdb/refcoco+/val.lmdb
7
+ mask_root: data/masks/refcoco+
8
+ TRAIN:
9
+ swin_type: base
10
+ swin_pretrain: ckpts/swin_base_patch4_window12_384_22k.pth
11
+ bert: bert-base-uncased
12
+ mha: '8-8-8-8'
13
+ input_size: 480
14
+ word_len: 20
15
+ word_dim: 768
16
+ vis_dim: 512
17
+ num_token: 2
18
+ token_dim: 512
19
+ sync_bn: True
20
+ dropout: 0.
21
+ fusion_drop: 0.
22
+ workers: 16 # data loader workers
23
+ workers_val: 8
24
+ batch_size: 64 #batch size for training
25
+ batch_size_val: 24 # batch size for validation during training, memory and speed tradeoff
26
+ start_epoch: 0
27
+ epochs: 50
28
+ lr_backbone: 5.e-5
29
+ lr_text_encoder: 5.e-5
30
+ lr: 1.e-4
31
+ weight_decay: 1.e-4
32
+ amsgrad: True
33
+ manual_seed:
34
+ print_freq: 100
35
+ exp_name: cgformer_test
36
+ output_folder: /data/seunghoon/CGFormer/exp/seunghoon
37
+ save_freq: 1
38
+ weight:
39
+ resume:
40
+ evaluate: True
41
+ metric_learning: False
42
+ exclude_multiobj: true
43
+ metric_mode: hardpos_only_refined
44
+ metric_loss_weight: 0.1
45
+ loss_option: ACE_verbonly
46
+ margin_value: 12
47
+ temperature: 0.07
48
+ hp_selection: strict
49
+ use_projections : True
50
+ mixup_lasttwo : False
51
+ filter_threshold: 0.68
52
+
53
+ Distributed:
54
+ # dist_url: tcp://localhost:18123
55
+ dist_backend: 'nccl'
56
+ # multiprocessing_distributed: True
57
+ world_size: 1
58
+ # rank: 0
59
+ TEST:
60
+ window12: True # if window12-pretrained weights were used for training, set this to True for testing
61
+ test_split: test
62
+ test_lmdb: data/lmdb/refcocog_u/test.lmdb
63
+ visualize: False
CGFormer/config/config_refzom_ace.yaml ADDED
@@ -0,0 +1,64 @@
1
+ DATA:
2
+ dataset: ref-zom
3
+ train_split: train
4
+ train_lmdb: /data2/projects/chaeyun/VerbCentric_RIS/datasets/lmdb/ref-zom/train.lmdb
5
+ val_split: test
6
+ val_lmdb: /data2/projects/chaeyun/VerbCentric_RIS/datasets/lmdb/ref-zom/test.lmdb
7
+ mask_root: /data2/projects/chaeyun/VerbCentric_RIS/datasets/masks/ref-zom
8
+ TRAIN:
9
+ swin_type: base
10
+ swin_pretrain: ckpts/swin_base_patch4_window12_384_22k.pth
11
+ bert: bert-base-uncased
12
+ mha: '8-8-8-8'
13
+ input_size: 480
14
+ word_len: 20
15
+ word_dim: 768
16
+ vis_dim: 512
17
+ num_token: 2
18
+ token_dim: 512
19
+ sync_bn: True
20
+ dropout: 0.
21
+ fusion_drop: 0.
22
+ workers: 32 # data loader workers
23
+ workers_val: 8
24
+ batch_size: 64 # batch size for training
25
+ batch_size_val: 16 # batch size for validation during training, memory and speed tradeoff
26
+ start_epoch: 0
27
+ epochs: 50
28
+ lr_backbone: 5.e-5
29
+ lr_text_encoder: 5.e-5
30
+ lr: 1.e-4
31
+ weight_decay: 1.e-4
32
+ amsgrad: True
33
+ manual_seed:
34
+ print_freq: 100
35
+ exp_name: cgformer_test
36
+ output_folder: /data/seunghoon/CGFormer/exp/seunghoon
37
+ save_freq: 1
38
+ weight:
39
+ resume:
40
+ evaluate: True
41
+ metric_learning: True
42
+ exclude_multiobj: True
43
+ metric_mode: hardpos_only_refined
44
+ metric_loss_weight: 0.1
45
+ loss_option: ACE_verbonly
46
+ margin_value: 12
47
+ temperature: 0.07
48
+ hp_selection: strict
49
+ filter_threshold: 0.5
50
+ mixup_lasttwo : False
51
+ use_projections : False
52
+ fuse_mode : simple_attn
53
+
54
+ Distributed:
55
+ # dist_url: tcp://localhost:18123
56
+ dist_backend: 'nccl'
57
+ # multiprocessing_distributed: True
58
+ world_size: 1
59
+ # rank: 0
60
+ TEST:
61
+ window12: True # if use window12 pretrained for training, testing set true
62
+ test_split: test
63
+ test_lmdb: data/lmdb/refcocog_u/test.lmdb
64
+ visualize: False
CGFormer/config/config_refzom_repro.yaml ADDED
@@ -0,0 +1,62 @@
+ DATA:
+ dataset: ref-zom
+ train_split: train
+ train_lmdb: /data2/projects/chaeyun/VerbCentric_RIS/datasets/lmdb/ref-zom/train.lmdb
+ val_split: test
+ val_lmdb: /data2/projects/chaeyun/VerbCentric_RIS/datasets/lmdb/ref-zom/test.lmdb
+ mask_root: /data2/projects/chaeyun/VerbCentric_RIS/datasets/masks/ref-zom
+ TRAIN:
+ swin_type: base
+ swin_pretrain: ckpts/swin_base_patch4_window12_384_22k.pth
+ bert: bert-base-uncased
+ mha: '8-8-8-8'
+ input_size: 480
+ word_len: 20
+ word_dim: 768
+ vis_dim: 512
+ num_token: 2
+ token_dim: 512
+ sync_bn: True
+ dropout: 0.
+ fusion_drop: 0.
+ workers: 32 # data loader workers
+ workers_val: 8
+ batch_size: 64 # batch size for training
+ batch_size_val: 16 # batch size for validation during training, memory and speed tradeoff
+ start_epoch: 0
+ epochs: 50
+ lr_backbone: 5.e-5
+ lr_text_encoder: 5.e-5
+ lr: 1.e-4
+ weight_decay: 1.e-4
+ amsgrad: True
+ manual_seed:
+ print_freq: 100
+ exp_name: cgformer_test
+ output_folder: /data/seunghoon/CGFormer/exp/seunghoon
+ save_freq: 1
+ weight:
+ resume:
+ evaluate: True
+ metric_learning: False
+ exclude_multiobj: True
+ metric_mode: hardpos_only_refined
+ metric_loss_weight: 0.1
+ loss_option: ACE_verbonly
+ margin_value: 12
+ temperature: 0.07
+ hp_selection: strict
+ filter_threshold: 0.5
+ mixup_lasttwo : False
+
+ Distributed:
+ # dist_url: tcp://localhost:18123
+ dist_backend: 'nccl'
+ # multiprocessing_distributed: True
+ world_size: 1
+ # rank: 0
+ TEST:
+ window12: True # if use window12 pretrained for training, testing set true
+ test_split: test
+ test_lmdb: /data2/projects/chaeyun/VerbCentric_RIS/datasets/lmdb/ref-zom/test.lmdb
+ visualize: False
CGFormer/config/config_refzom_repro_eval.yaml ADDED
@@ -0,0 +1,62 @@
+ DATA:
+ dataset: ref-zom
+ train_split: train
+ train_lmdb: /data2/projects/chaeyun/VerbCentric_RIS/datasets/lmdb/ref-zom/train.lmdb
+ val_split: test
+ val_lmdb: /data2/projects/chaeyun/VerbCentric_RIS/datasets/lmdb/ref-zom/test.lmdb
+ mask_root: /data2/projects/chaeyun/VerbCentric_RIS/datasets/masks/ref-zom
+ TRAIN:
+ swin_type: base
+ swin_pretrain: ckpts/swin_base_patch4_window12_384_22k.pth
+ bert: bert-base-uncased
+ mha: '8-8-8-8'
+ input_size: 480
+ word_len: 20
+ word_dim: 768
+ vis_dim: 512
+ num_token: 2
+ token_dim: 512
+ sync_bn: True
+ dropout: 0.
+ fusion_drop: 0.
+ workers: 32 # data loader workers
+ workers_val: 8
+ batch_size: 64 # batch size for training
+ batch_size_val: 16 # batch size for validation during training, memory and speed tradeoff
+ start_epoch: 0
+ epochs: 50
+ lr_backbone: 5.e-5
+ lr_text_encoder: 5.e-5
+ lr: 1.e-4
+ weight_decay: 1.e-4
+ amsgrad: True
+ manual_seed:
+ print_freq: 100
+ exp_name: cgformer_test
+ output_folder: /data/seunghoon/CGFormer/exp/seunghoon
+ save_freq: 1
+ weight:
+ resume:
+ evaluate: True
+ metric_learning: False
+ exclude_multiobj: True
+ metric_mode: hardpos_only_refined
+ metric_loss_weight: 0.1
+ loss_option: ACE_verbonly
+ margin_value: 12
+ temperature: 0.07
+ hp_selection: strict
+ filter_threshold: 0.5
+ mixup_lasttwo : False
+
+ Distributed:
+ # dist_url: tcp://localhost:18123
+ dist_backend: 'nccl'
+ # multiprocessing_distributed: True
+ world_size: 1
+ # rank: 0
+ TEST:
+ window12: True # if use window12 pretrained for training, testing set true
+ test_split: test
+ test_lmdb: /data2/projects/chaeyun/VerbCentric_RIS/datasets/lmdb/ref-zom/test.lmdb
+ visualize: False
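
The TRAIN block in these configs carries three separate learning rates (lr_backbone, lr_text_encoder, lr) together with weight_decay and amsgrad. A common way such values are consumed is to split the model's parameters into per-module optimizer groups; the sketch below is only an illustration that assumes parameter names containing "backbone" and "text_encoder", and is not necessarily the repository's exact training code.

# Illustrative only: assumed parameter-name prefixes, not CGFormer's actual code.
import torch

def build_optimizer(model, cfg_train):
    # split parameters by (assumed) module name prefixes
    backbone, text, rest = [], [], []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue
        if "backbone" in name:          # Swin visual backbone (assumed naming)
            backbone.append(param)
        elif "text_encoder" in name:    # BERT text encoder (assumed naming)
            text.append(param)
        else:
            rest.append(param)
    param_groups = [
        {"params": backbone, "lr": cfg_train["lr_backbone"]},
        {"params": text, "lr": cfg_train["lr_text_encoder"]},
        {"params": rest, "lr": cfg_train["lr"]},
    ]
    return torch.optim.AdamW(param_groups,
                             weight_decay=cfg_train["weight_decay"],
                             amsgrad=cfg_train["amsgrad"])
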
CGFormer/config/impl/config.yaml ADDED
@@ -0,0 +1,53 @@
+ DATA:
+ dataset: refcocog_u
+ train_split: train
+ train_lmdb: data/lmdb/refcocog_u/train.lmdb
+ val_split: val
+ val_lmdb: data/lmdb/refcocog_u/val.lmdb
+ mask_root: data/masks/refcocog_u
+ AUG:
+ check: null
+ TRAIN:
+ swin_type: base
+ swin_pretrain: ckpts/swin_base_patch4_window12_384_22k.pth
+ bert: bert-base-uncased
+ mha: '8-8-8-8'
+ input_size: 480
+ word_len: 20
+ word_dim: 768
+ vis_dim: 512
+ num_token: 2
+ token_dim: 512
+ sync_bn: True
+ dropout: 0.
+ fusion_drop: 0.
+ workers: 32 # data loader workers
+ workers_val: 8
+ batch_size: 32 #64 # batch size for training
+ batch_size_val: 16 # batch size for validation during training, memory and speed tradeoff
+ start_epoch: 0
+ epochs: 3
+ lr_backbone: 5.e-5
+ lr_text_encoder: 5.e-5
+ lr: 1.e-4
+ weight_decay: 1.e-4
+ amsgrad: True
+ manual_seed:
+ print_freq: 100
+ exp_name: cgformer
+ output_folder: exp/impl/
+ save_freq: 1
+ weight:
+ resume:
+ evaluate: True
+ Distributed:
+ # dist_url: tcp://localhost:18123
+ dist_backend: 'nccl'
+ # multiprocessing_distributed: True
+ world_size: 1
+ # rank: 0
+ TEST:
+ window12: True # if use window12 pretrained for training, testing set true
+ test_split: test
+ test_lmdb: data/lmdb/refcocog_u/test.lmdb
+ visualize: False
CGFormer/config/open.yaml ADDED
@@ -0,0 +1,55 @@
+ DATA:
+ dataset: refcoco
+ train_split: train_seen
+ train_lmdb: path/open_lmdb/refcoco/train_seen.lmdb
+ val_seen_split: val_seen
+ val_seen_lmdb: path/open_lmdb/refcoco/val_seen.lmdb
+ val_unseen_split: val_unseen
+ val_unseen_lmdb: path/open_lmdb/refcoco/val_unseen.lmdb
+ mask_root: path/masks/refcoco
+ TRAIN:
+ swin_type: base
+ swin_pretrain: path/swin_base_patch4_window12_384_22k.pth
+ bert: bert-base-uncased
+ clip_pretrain: path/pretrain/ViT-L-14-336px.pt
+ mha: '8-8-8-8'
+ input_size: 480
+ clip_dim: 768
+ word_len: 20
+ num_token: 2
+ word_dim: 768
+ vis_dim: 512
+ token_dim: 512
+ sync_bn: True
+ dropout: 0.
+ fusion_drop: 0.
+ workers: 32 # data loader workers
+ workers_val: 8
+ batch_size: 64 # batch size for training
+ batch_size_val: 16 # batch size for validation during training, memory and speed tradeoff
+ start_epoch: 0
+ epochs: 1000
+ lr_backbone: 5.e-5
+ lr_text_encoder: 5.e-5
+ lr: 1.e-4
+ weight_decay: 1.e-4
+ amsgrad: True
+ manual_seed: 0
+ print_freq: 100
+ exp_name: open
+ output_folder: exp/refcoco
+ save_freq: 1
+ weight:
+ resume:
+ evaluate: True # evaluate on validation set, extra gpu memory needed and small batch_size_val is recommend
+ Distributed:
+ dist_url: tcp://localhost:12345
+ dist_backend: 'nccl'
+ multiprocessing_distributed: True
+ world_size: 1
+ rank: 0
+ TEST:
+ window12: True # if use window12 pretrained for training, testing set true
+ test_split: test_unseen
+ test_lmdb: path/refcoco/test_unseen.lmdb
+ visualize: False
CGFormer/config/refcoco_mosaic/config.yaml ADDED
@@ -0,0 +1,59 @@
+ DATA:
+ dataset: refcoco
+ train_split: train
+ train_lmdb: data/lmdb/refcoco/train.lmdb
+ val_split: val
+ val_lmdb: data/lmdb/refcoco/val.lmdb
+ mask_root: data/masks/refcoco
+
+ TRAIN:
+ swin_type: base
+ swin_pretrain: ckpts/swin_base_patch4_window12_384_22k.pth
+ bert: bert-base-uncased
+ mha: '8-8-8-8'
+ input_size: 480
+ word_len: 20
+ word_dim: 768
+ vis_dim: 512
+ num_token: 2
+ token_dim: 512
+ sync_bn: True
+ dropout: 0.
+ fusion_drop: 0.
+ workers: 16 # data loader workers
+ workers_val: 8
+ batch_size: 64 #batch size for training
+ batch_size_val: 16 # 16 batch size for validation during training, memory and speed tradeoff
+ start_epoch: 0
+ epochs: 50
+ lr_backbone: 5.e-5
+ lr_text_encoder: 5.e-5
+ lr: 1.e-4
+ weight_decay: 1.e-4
+ amsgrad: True
+ manual_seed:
+ print_freq: 100
+ exp_name: cgformer
+ output_folder: exp/refcoco_mosaic/
+ save_freq: 1
+ weight:
+ resume:
+ evaluate: True
+ aug:
+ num_bgs: 4
+ aug_prob: 0.6
+ tgt_selection: fixed
+ move_crs_pnt: False
+ blur: False
+
+ Distributed:
+ # dist_url: tcp://localhost:18123
+ dist_backend: 'nccl'
+ # multiprocessing_distributed: True
+ world_size: 1
+ # rank: 0
+ TEST:
+ window12: True # if use window12 pretrained for training, testing set true
+ test_split: val
+ test_lmdb: data/lmdb/refcoco/test.lmdb
+ visualize: False
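
The aug block in this mosaic config (num_bgs: 4, aug_prob: 0.6) points to a mosaic-style augmentation applied with a fixed probability. The sketch below is a purely hypothetical reading of those two keys, a 2x2 tiling of the target image with three background images, and it does not reproduce the repository's tgt_selection, move_crs_pnt, or blur handling.

# Illustrative only: a hypothetical mosaic helper, not the repository's augmentation code.
import random
import numpy as np

def maybe_mosaic(target, backgrounds, aug_prob=0.6, num_bgs=4):
    """target and each background: HxWx3 uint8 arrays of identical size."""
    if random.random() > aug_prob or num_bgs != 4 or len(backgrounds) < num_bgs - 1:
        return target                                 # keep the original image
    tiles = [target] + list(backgrounds[:num_bgs - 1])
    top = np.concatenate(tiles[:2], axis=1)           # top-left, top-right
    bottom = np.concatenate(tiles[2:], axis=1)        # bottom-left, bottom-right
    return np.concatenate([top, bottom], axis=0)      # 2x2 grid, 2H x 2W output
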