Spaces:

lolout1
/

txstNeuroNest

Running

App Files Files Community

lolout1 committed on Sep 4

Commit

b4b0e11

verified ·

1 Parent(s): 021a4aa

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.dockerignore +6 -0
.gitattributes +10 -32
.gitignore +15 -0
Dockerfile +125 -0
Dockerfile1 +126 -0
NeuroNest/.gitattributes +35 -0
NeuroNest/README.md +12 -0
README.md +8 -5
app.py +110 -0
configs/.DS_Store +0 -0
configs/ade20k/Base-ADE20K-UnifiedSegmentation.yaml +68 -0
configs/ade20k/oneformer_R50_bs16_160k.yaml +58 -0
configs/ade20k/oneformer_dinat_large_IN21k_384_bs16_160k.yaml +42 -0
configs/ade20k/oneformer_swin_large_IN21k_384_bs16_160k.yaml +40 -0
configs/cityscapes/.DS_Store +0 -0
configs/cityscapes/Base-Cityscapes-UnifiedSegmentation.yaml +68 -0
configs/cityscapes/oneformer_R50_bs16_90k.yaml +59 -0
configs/cityscapes/oneformer_dinat_large_bs16_90k.yaml +22 -0
configs/cityscapes/oneformer_swin_large_IN21k_384_bs16_90k.yaml +20 -0
configs/coco/Base-COCO-UnifiedSegmentation.yaml +54 -0
configs/coco/oneformer_R50_bs16_50ep.yaml +59 -0
configs/coco/oneformer_dinat_large_bs16_100ep.yaml +22 -0
configs/coco/oneformer_swin_large_IN21k_384_bs16_100ep.yaml +25 -0
deform_setup.sh +21 -0
deform_setup_cpu.sh +11 -0
demo/colormap.py +170 -0
demo/defaults.py +77 -0
demo/predictor.py +190 -0
demo/visualizer.py +1350 -0
gradio_test.py +757 -0
install.py +19 -0
oneformer/.DS_Store +0 -0
oneformer/__init__.py +9 -0
oneformer/config.py +239 -0
oneformer/data/__init__.py +2 -0
oneformer/data/bpe_simple_vocab_16e6.txt +0 -0
oneformer/data/build.py +117 -0
oneformer/data/dataset_mappers/__init__.py +1 -0
oneformer/data/dataset_mappers/coco_unified_new_baseline_dataset_mapper.py +341 -0
oneformer/data/dataset_mappers/dataset_mapper.py +203 -0
oneformer/data/dataset_mappers/oneformer_unified_dataset_mapper.py +375 -0
oneformer/data/datasets/__init__.py +7 -0
oneformer/data/datasets/register_ade20k_instance.py +56 -0
oneformer/data/datasets/register_ade20k_panoptic.py +394 -0
oneformer/data/datasets/register_cityscapes_panoptic.py +199 -0
oneformer/data/datasets/register_coco_panoptic2instance.py +44 -0
oneformer/data/datasets/register_coco_panoptic_annos_semseg.py +367 -0
oneformer/data/tokenizer.py +200 -0
oneformer/evaluation/__init__.py +3 -0
oneformer/evaluation/cityscapes_evaluation.py +201 -0

.dockerignore ADDED Viewed

	@@ -0,0 +1,6 @@

+__pycache__
+*.pyc
+*.pyo
+*.pyd
+.git
+.gitignore

.gitattributes CHANGED Viewed

@@ -1,35 +1,13 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
 *.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

 *.pt filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.so filter=lfs diff=lfs merge=lfs -text
+*.o filter=lfs diff=lfs merge=lfs -text
+*.egg filter=lfs diff=lfs merge=lfs -text
+*.tar.gz filter=lfs diff=lfs merge=lfs -text
 *.tgz filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+output_floor_blackspot/model_0004999.pth filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,15 @@

+*.out
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+slurm-*.out
+output_*/
+build/
+dist/
+*.egg-info/
+.env
+oneformer/modeling/pixel_decoder/ops/build/
+oneformer/modeling/pixel_decoder/ops/dist/
+oneformer/modeling/pixel_decoder/ops/*.egg-info/
+environment.yml

Dockerfile ADDED Viewed

	@@ -0,0 +1,125 @@

+FROM ubuntu:20.04
+ENV DEBIAN_FRONTEND=noninteractive
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    python3.8 \
+    python3.8-dev \
+    python3.8-distutils \
+    git \
+    wget \
+    curl \
+    build-essential \
+    libssl-dev \
+    libffi-dev \
+    libxml2-dev \
+    libxslt1-dev \
+    zlib1g-dev \
+    libsm6 \
+    libxext6 \
+    libxrender-dev \
+    libgl1-mesa-glx \
+    libglib2.0-0 \
+    libgomp1 \
+    ninja-build \
+    && rm -rf /var/lib/apt/lists/*
+# Set python3.8 as default
+RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.8 1 && \
+    update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1
+# Install pip for Python 3.8 specifically
+RUN curl https://bootstrap.pypa.io/pip/3.8/get-pip.py -o get-pip.py && \
+    python get-pip.py && \
+    rm get-pip.py
+# Upgrade pip and install build tools
+RUN python -m pip install --upgrade pip==23.0.1 setuptools==59.5.0 wheel cython
+# Create user for HF Spaces
+RUN useradd -m -u 1000 user
+USER user
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+WORKDIR /app
+# Install PyTorch 1.9 CPU first
+RUN pip install --user torch==1.9.0+cpu torchvision==0.10.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
+# Install numpy first (required for other packages)
+RUN pip install --user numpy==1.21.6
+# Install core dependencies in order
+RUN pip install --user \
+    Pillow==8.3.2 \
+    opencv-python==4.5.5.64
+# Install scientific computing dependencies
+RUN pip install --user \
+    scipy==1.7.3 \
+    scikit-image==0.19.3 \
+    scikit-learn==1.0.2
+# Install detectron2 for PyTorch 1.9 CPU
+RUN pip install --user detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cpu/torch1.9/index.html
+# Install pycocotools separately to avoid compilation issues
+RUN pip install --user pycocotools --no-build-isolation
+# Install ML dependencies
+RUN pip install --user \
+    timm==0.4.12 \
+    einops==0.6.1 \
+    h5py==3.7.0 \
+    shapely==1.8.5 \
+    tqdm==4.64.1 \
+    imutils==0.5.4
+# Install web framework dependencies with compatible versions
+RUN pip install --user \
+    httpx==0.23.0 \
+    httpcore==0.15.0 \
+    anyio==3.6.1 \
+    starlette==0.19.1 \
+    fastapi==0.78.0 \
+    uvicorn==0.18.2
+# Install Gradio and HuggingFace Hub with compatible versions
+# These versions are compatible with each other and the rest of the stack
+RUN pip install --user \
+    huggingface_hub==0.17.3 \
+    gradio==3.50.2
+# Try to install NATTEN (optional)
+RUN pip install --user natten==0.14.6 -f https://shi-labs.com/natten/wheels/cpu/torch1.9/index.html || \
+    echo "NATTEN installation failed - continuing without it"
+# Install remaining dependencies
+RUN pip install --user \
+    PyYAML==5.4.1 \
+    matplotlib==3.5.3 \
+    regex==2022.10.31 \
+    ftfy==6.1.1 \
+    wandb \
+    diffdist \
+    inflect==6.0.4 \
+    gdown==4.5.4 \
+    wget==3.2
+# Copy application files
+COPY --chown=user:user . /app
+# Set environment variables for CPU-only operation
+ENV CUDA_VISIBLE_DEVICES=""
+ENV FORCE_CUDA="0"
+ENV OMP_NUM_THREADS=4
+ENV MKL_NUM_THREADS=4
+ENV PYTHONUNBUFFERED=1
+# Expose port for Gradio
+EXPOSE 7860
+# Run the application
+CMD ["python", "app.py"]

Dockerfile1 ADDED Viewed

	@@ -0,0 +1,126 @@

+FROM ubuntu:20.04
+ENV DEBIAN_FRONTEND=noninteractive
+# Install system dependencies
+RUN apt-get update && apt-get install -y \
+    python3.8 \
+    python3.8-dev \
+    python3.8-distutils \
+    git \
+    wget \
+    curl \
+    build-essential \
+    libssl-dev \
+    libffi-dev \
+    libxml2-dev \
+    libxslt1-dev \
+    zlib1g-dev \
+    libsm6 \
+    libxext6 \
+    libxrender-dev \
+    libgl1-mesa-glx \
+    libglib2.0-0 \
+    libgomp1 \
+    ninja-build \
+    && rm -rf /var/lib/apt/lists/*
+# Set python3.8 as default
+RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.8 1 && \
+    update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.8 1
+# Install pip for Python 3.8 specifically
+RUN curl https://bootstrap.pypa.io/pip/3.8/get-pip.py -o get-pip.py && \
+    python get-pip.py && \
+    rm get-pip.py
+# Upgrade pip and install build tools
+RUN python -m pip install --upgrade pip==23.0.1 setuptools==59.5.0 wheel cython
+# Create user for HF Spaces
+RUN useradd -m -u 1000 user
+USER user
+ENV HOME=/home/user \
+    PATH=/home/user/.local/bin:$PATH
+WORKDIR /app
+# Install PyTorch 1.9 CPU first
+RUN pip install --user torch==1.9.0+cpu torchvision==0.10.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
+# Install numpy first (required for other packages)
+RUN pip install --user numpy==1.21.6
+# Install core dependencies in order
+RUN pip install --user \
+    Pillow==8.3.2 \
+    opencv-python==4.5.5.64
+# Install scientific computing dependencies
+RUN pip install --user \
+    scipy==1.7.3 \
+    scikit-image==0.19.3 \
+    scikit-learn==1.0.2
+# Install detectron2 for PyTorch 1.9 CPU
+RUN pip install --user detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cpu/torch1.9/index.html
+# Install pycocotools separately to avoid compilation issues
+RUN pip install --user pycocotools --no-build-isolation
+# Install ML dependencies
+RUN pip install --user \
+    timm==0.4.12 \
+    einops==0.6.1 \
+    h5py==3.7.0 \
+    shapely==1.8.5 \
+    tqdm==4.64.1 \
+    imutils==0.5.4
+# Install web framework dependencies with compatible versions
+RUN pip install --user \
+    httpx==0.23.0 \
+    httpcore==0.15.0 \
+    anyio==3.6.1 \
+    starlette==0.19.1 \
+    fastapi==0.78.0 \
+    uvicorn==0.18.2
+# Install Gradio and HuggingFace Hub with compatible versions
+# These versions are compatible with each other and the rest of the stack
+RUN pip install --user \
+    huggingface_hub==0.17.3 \
+    gradio==3.50.2
+# Try to install NATTEN (optional)
+# NOTE: this step is disabled. Keep EVERY line of the command commented out; a stray continuation line is parsed as an unknown instruction and breaks the build.
+#RUN pip install --user natten==0.14.6 -f https://shi-labs.com/natten/wheels/cpu/torch1.9/index.html || \
+#    echo "NATTEN installation failed - continuing without it"
+# Install remaining dependencies
+RUN pip install --user \
+    PyYAML==5.4.1 \
+    matplotlib==3.5.3 \
+    regex==2022.10.31 \
+    ftfy==6.1.1 \
+    wandb \
+    diffdist \
+    inflect==6.0.4 \
+    gdown==4.5.4 \
+    wget==3.2
+# Copy application files
+COPY --chown=user:user . /app
+# Set environment variables for CPU-only operation
+ENV CUDA_VISIBLE_DEVICES=""
+ENV FORCE_CUDA="0"
+ENV OMP_NUM_THREADS=4
+ENV MKL_NUM_THREADS=4
+ENV PYTHONUNBUFFERED=1
+# Expose port for Gradio
+EXPOSE 7860
+# Run the application
+CMD ["python", "app.py"]

NeuroNest/.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

NeuroNest/README.md ADDED Viewed

	@@ -0,0 +1,12 @@

+---
+title: NeuroNest
+emoji: 🏆
+colorFrom: indigo
+colorTo: green
+sdk: gradio
+sdk_version: 5.30.0
+app_file: app.py
+pinned: false
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

README.md CHANGED Viewed

@@ -1,12 +1,15 @@
 ---
 title: NeuroNesttxst
-emoji: 🐨
-colorFrom: purple
-colorTo: blue
-sdk: gradio
-sdk_version: 5.44.1
 app_file: app.py
 pinned: false
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 title: NeuroNesttxst
+emoji: 🧠
+colorFrom: blue
+colorTo: purple
+sdk: docker
 app_file: app.py
 pinned: false
 ---
+# NeuroNest OneFormer
+OneFormer: One Transformer to Rule Universal Image Segmentation
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,110 @@

+#!/usr/bin/env python3
+"""
+NeuroNest Application Entry Point
+Handles initialization and graceful startup for Hugging Face Spaces
+"""
+import os
+import sys
+import logging
+import time
+from pathlib import Path
+# Configure logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+# Set environment variables
+os.environ['CUDA_VISIBLE_DEVICES'] = ''
+os.environ['FORCE_CUDA'] = '0'
+def setup_oneformer_imports():
+    """Add OneFormer to Python path if needed"""
+    oneformer_path = Path(__file__).parent / "oneformer"
+    if oneformer_path.exists() and str(oneformer_path) not in sys.path:
+        sys.path.insert(0, str(oneformer_path))
+        logger.info(f"Added OneFormer to path: {oneformer_path}")
+def check_dependencies():
+    """Check if all required dependencies are available"""
+    try:
+        import torch
+        logger.info(f"PyTorch version: {torch.__version__}")
+        logger.info(f"CUDA available: {torch.cuda.is_available()}")
+        # Verify torch version
+        if not torch.__version__.startswith('1.9'):
+            logger.warning(f"Expected PyTorch 1.9.x, got {torch.__version__}")
+        import detectron2
+        logger.info(f"Detectron2 version: {detectron2.__version__}")
+        import gradio as gr
+        logger.info(f"Gradio version: {gr.__version__}")
+        import cv2
+        logger.info(f"OpenCV version: {cv2.__version__}")
+        import PIL
+        logger.info(f"Pillow version: {PIL.__version__}")
+        # Check PIL compatibility
+        if hasattr(PIL.Image, 'LINEAR'):
+            logger.info("PIL has LINEAR attribute")
+        elif hasattr(PIL.Image, 'BILINEAR'):
+            logger.info("PIL has BILINEAR attribute (newer version)")
+            # Monkey patch for compatibility
+            PIL.Image.LINEAR = PIL.Image.BILINEAR
+            logger.info("Applied PIL compatibility patch")
+        # Check numpy version
+        import numpy as np
+        logger.info(f"NumPy version: {np.__version__}")
+        return True
+    except ImportError as e:
+        logger.error(f"Missing dependency: {e}")
+        return False
+def main():
+    """Main application entry point"""
+    print("=" * 50)
+    print(f"NeuroNest Application Startup")
+    print(f"Time: {time.strftime('%Y-%m-%d %H:%M:%S')}")
+    print("=" * 50)
+    # Setup paths
+    setup_oneformer_imports()
+    # Check dependencies
+    if not check_dependencies():
+        logger.error("Dependency check failed")
+        sys.exit(1)
+    try:
+        # Import and launch the Gradio interface
+        from gradio_test import create_gradio_interface
+        logger.info("Creating Gradio interface...")
+        interface = create_gradio_interface()
+        logger.info("Launching application...")
+        interface.queue(max_size=10).launch(
+            server_name="0.0.0.0",
+            server_port=7860,
+            share=True,
+            debug = True
+        )
+    except Exception as e:
+        logger.error(f"Error launching app: {e}")
+        import traceback
+        traceback.print_exc()
+        sys.exit(1)
+if __name__ == "__main__":
+    main()

configs/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

configs/ade20k/Base-ADE20K-UnifiedSegmentation.yaml ADDED Viewed

	@@ -0,0 +1,68 @@

+MODEL:
+  BACKBONE:
+    FREEZE_AT: 0
+    NAME: "build_resnet_backbone"
+  WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
+  PIXEL_MEAN: [123.675, 116.280, 103.530]
+  PIXEL_STD: [58.395, 57.120, 57.375]
+  RESNETS:
+    DEPTH: 50
+    STEM_TYPE: "basic"  # not used
+    STEM_OUT_CHANNELS: 64
+    STRIDE_IN_1X1: False
+    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+    # NORM: "SyncBN"
+    RES5_MULTI_GRID: [1, 1, 1]  # not used
+DATASETS:
+  TRAIN: ("ade20k_panoptic_train",)
+  TEST_PANOPTIC: ("ade20k_panoptic_val",)
+  TEST_INSTANCE: ("ade20k_instance_val",)
+  TEST_SEMANTIC: ("ade20k_sem_seg_val",)
+SOLVER:
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.0001
+  MAX_ITER: 160000
+  WARMUP_FACTOR: 1.0
+  WARMUP_ITERS: 0
+  WEIGHT_DECAY: 0.05
+  OPTIMIZER: "ADAMW"
+  LR_SCHEDULER_NAME: "WarmupPolyLR"
+  BACKBONE_MULTIPLIER: 0.1
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: "full_model"
+    CLIP_VALUE: 0.01
+    NORM_TYPE: 2.0
+  AMP:
+    ENABLED: True
+INPUT:
+  MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 512) for x in range(5, 21)]"]
+  MIN_SIZE_TRAIN_SAMPLING: "choice"
+  MIN_SIZE_TEST: 512
+  MAX_SIZE_TRAIN: 2048
+  MAX_SIZE_TEST: 2048
+  CROP:
+    ENABLED: True
+    TYPE: "absolute"
+    SIZE: (512, 512)
+    SINGLE_CATEGORY_MAX_AREA: 1.0
+  COLOR_AUG_SSD: True
+  SIZE_DIVISIBILITY: 512  # used in dataset mapper
+  FORMAT: "RGB"
+  DATASET_MAPPER_NAME: "oneformer_unified"
+  MAX_SEQ_LEN: 77
+  TASK_SEQ_LEN: 77
+  TASK_PROB:
+    SEMANTIC: 0.33
+    INSTANCE: 0.66
+TEST:
+  EVAL_PERIOD: 5000
+  AUG:
+    ENABLED: False
+    MIN_SIZES: [256, 384, 512, 640, 768, 896]
+    MAX_SIZE: 3584
+    FLIP: True
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: True
+  NUM_WORKERS: 4
+VERSION: 2

configs/ade20k/oneformer_R50_bs16_160k.yaml ADDED Viewed

	@@ -0,0 +1,58 @@

+_BASE_: Base-ADE20K-UnifiedSegmentation.yaml
+MODEL:
+  META_ARCHITECTURE: "OneFormer"
+  SEM_SEG_HEAD:
+    NAME: "OneFormerHead"
+    IGNORE_VALUE: 255
+    NUM_CLASSES: 150
+    LOSS_WEIGHT: 1.0
+    CONVS_DIM: 256
+    MASK_DIM: 256
+    NORM: "GN"
+    # pixel decoder
+    PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
+    IN_FEATURES: ["res2", "res3", "res4", "res5"]
+    DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
+    COMMON_STRIDE: 4
+    TRANSFORMER_ENC_LAYERS: 6
+  ONE_FORMER:
+    TRANSFORMER_DECODER_NAME: "ContrastiveMultiScaleMaskedTransformerDecoder"
+    TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
+    DEEP_SUPERVISION: True
+    NO_OBJECT_WEIGHT: 0.1
+    CLASS_WEIGHT: 2.0
+    MASK_WEIGHT: 5.0
+    DICE_WEIGHT: 5.0
+    CONTRASTIVE_WEIGHT: 0.5
+    CONTRASTIVE_TEMPERATURE: 0.07
+    HIDDEN_DIM: 256
+    NUM_OBJECT_QUERIES: 150
+    USE_TASK_NORM: True
+    NHEADS: 8
+    DROPOUT: 0.1
+    DIM_FEEDFORWARD: 2048
+    ENC_LAYERS: 0
+    PRE_NORM: False
+    ENFORCE_INPUT_PROJ: False
+    SIZE_DIVISIBILITY: 32
+    CLASS_DEC_LAYERS: 2
+    DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
+    TRAIN_NUM_POINTS: 12544
+    OVERSAMPLE_RATIO: 3.0
+    IMPORTANCE_SAMPLE_RATIO: 0.75
+  TEXT_ENCODER:
+    WIDTH: 256
+    CONTEXT_LENGTH: 77
+    NUM_LAYERS: 6
+    VOCAB_SIZE: 49408
+    PROJ_NUM_LAYERS: 2
+    N_CTX: 16
+  TEST:
+    SEMANTIC_ON: True
+    INSTANCE_ON: True
+    PANOPTIC_ON: True
+    OVERLAP_THRESHOLD: 0.8
+    OBJECT_MASK_THRESHOLD: 0.8
+    TASK: "panoptic"
+TEST:
+  DETECTIONS_PER_IMAGE: 150

configs/ade20k/oneformer_dinat_large_IN21k_384_bs16_160k.yaml ADDED Viewed

	@@ -0,0 +1,42 @@

+_BASE_: oneformer_R50_bs16_160k.yaml
+MODEL:
+  BACKBONE:
+    NAME: "D2DiNAT"
+  DiNAT:
+    EMBED_DIM: 192
+    MLP_RATIO: 2.0
+    DEPTHS: [3, 4, 18, 5]
+    NUM_HEADS: [6, 12, 24, 48]
+    KERNEL_SIZE: 11
+    DROP_PATH_RATE: 0.3
+    DILATIONS: [[1, 20, 1], [1, 5, 1, 10], [1, 2, 1, 3, 1, 4, 1, 5, 1, 2, 1, 3, 1, 4, 1, 5, 1, 5], [1, 2, 1, 2, 1]]
+  WEIGHTS: "dinat_large_in22k_in1k_384_11x11.pkl"
+  PIXEL_MEAN: [123.675, 116.280, 103.530]
+  PIXEL_STD: [58.395, 57.120, 57.375]
+  ONE_FORMER:
+    NUM_OBJECT_QUERIES: 250
+SOLVER:
+  AMP:
+    ENABLED: False
+INPUT:
+  MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"]
+  MIN_SIZE_TRAIN_SAMPLING: "choice"
+  MIN_SIZE_TEST: 640
+  MAX_SIZE_TRAIN: 2560
+  MAX_SIZE_TEST: 2560
+  CROP:
+    ENABLED: True
+    TYPE: "absolute"
+    SIZE: (640, 640)
+    SINGLE_CATEGORY_MAX_AREA: 1.0
+  COLOR_AUG_SSD: True
+  SIZE_DIVISIBILITY: 640  # used in dataset mapper
+  FORMAT: "RGB"
+TEST:
+  DETECTIONS_PER_IMAGE: 250
+  EVAL_PERIOD: 5000
+  AUG:
+    ENABLED: False
+    MIN_SIZES: [320, 480, 640, 800, 960, 1120]
+    MAX_SIZE: 4480
+    FLIP: True

configs/ade20k/oneformer_swin_large_IN21k_384_bs16_160k.yaml ADDED Viewed

	@@ -0,0 +1,40 @@

+_BASE_: oneformer_R50_bs16_160k.yaml
+MODEL:
+  BACKBONE:
+    NAME: "D2SwinTransformer"
+  SWIN:
+    EMBED_DIM: 192
+    DEPTHS: [2, 2, 18, 2]
+    NUM_HEADS: [6, 12, 24, 48]
+    WINDOW_SIZE: 12
+    APE: False
+    DROP_PATH_RATE: 0.3
+    PATCH_NORM: True
+    PRETRAIN_IMG_SIZE: 384
+  WEIGHTS: "swin_large_patch4_window12_384_22k.pkl"
+  PIXEL_MEAN: [123.675, 116.280, 103.530]
+  PIXEL_STD: [58.395, 57.120, 57.375]
+  ONE_FORMER:
+    NUM_OBJECT_QUERIES: 250
+INPUT:
+  MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 640) for x in range(5, 21)]"]
+  MIN_SIZE_TRAIN_SAMPLING: "choice"
+  MIN_SIZE_TEST: 640
+  MAX_SIZE_TRAIN: 2560
+  MAX_SIZE_TEST: 2560
+  CROP:
+    ENABLED: True
+    TYPE: "absolute"
+    SIZE: (640, 640)
+    SINGLE_CATEGORY_MAX_AREA: 1.0
+  COLOR_AUG_SSD: True
+  SIZE_DIVISIBILITY: 640  # used in dataset mapper
+  FORMAT: "RGB"
+TEST:
+  DETECTIONS_PER_IMAGE: 250
+  EVAL_PERIOD: 5000
+  AUG:
+    ENABLED: False
+    MIN_SIZES: [320, 480, 640, 800, 960, 1120]
+    MAX_SIZE: 4480
+    FLIP: True

configs/cityscapes/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

configs/cityscapes/Base-Cityscapes-UnifiedSegmentation.yaml ADDED Viewed

	@@ -0,0 +1,68 @@

+MODEL:
+  BACKBONE:
+    FREEZE_AT: 0
+    NAME: "build_resnet_backbone"
+  WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
+  PIXEL_MEAN: [123.675, 116.280, 103.530]
+  PIXEL_STD: [58.395, 57.120, 57.375]
+  RESNETS:
+    DEPTH: 50
+    STEM_TYPE: "basic"  # not used
+    STEM_OUT_CHANNELS: 64
+    STRIDE_IN_1X1: False
+    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+    NORM: "SyncBN"  # use syncbn for cityscapes dataset
+    RES5_MULTI_GRID: [1, 1, 1]  # not used
+DATASETS:
+  TRAIN: ("cityscapes_fine_panoptic_train",)
+  TEST_PANOPTIC: ("cityscapes_fine_panoptic_val",)
+  TEST_INSTANCE: ("cityscapes_fine_instance_seg_val",)
+  TEST_SEMANTIC: ("cityscapes_fine_sem_seg_val",)
+SOLVER:
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.0001
+  MAX_ITER: 90000
+  WARMUP_FACTOR: 1.0
+  WARMUP_ITERS: 0
+  WEIGHT_DECAY: 0.05
+  OPTIMIZER: "ADAMW"
+  LR_SCHEDULER_NAME: "WarmupPolyLR"
+  BACKBONE_MULTIPLIER: 0.1
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: "full_model"
+    CLIP_VALUE: 0.01
+    NORM_TYPE: 2.0
+  AMP:
+    ENABLED: True
+INPUT:
+  MIN_SIZE_TRAIN: !!python/object/apply:eval ["[int(x * 0.1 * 1024) for x in range(5, 21)]"]
+  MIN_SIZE_TRAIN_SAMPLING: "choice"
+  MIN_SIZE_TEST: 1024
+  MAX_SIZE_TRAIN: 4096
+  MAX_SIZE_TEST: 2048
+  CROP:
+    ENABLED: True
+    TYPE: "absolute"
+    SIZE: (512, 1024)
+    SINGLE_CATEGORY_MAX_AREA: 1.0
+  COLOR_AUG_SSD: True
+  SIZE_DIVISIBILITY: -1
+  FORMAT: "RGB"
+  DATASET_MAPPER_NAME: "oneformer_unified"
+  MAX_SEQ_LEN: 77
+  TASK_SEQ_LEN: 77
+  TASK_PROB:
+    SEMANTIC: 0.33
+    INSTANCE: 0.66
+TEST:
+  EVAL_PERIOD: 5000
+  AUG:
+    ENABLED: False
+    MIN_SIZES: [512, 768, 1024, 1280, 1536, 1792]
+    MAX_SIZE: 4096
+    FLIP: True
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: True
+  NUM_WORKERS: 4
+VERSION: 2

configs/cityscapes/oneformer_R50_bs16_90k.yaml ADDED Viewed

	@@ -0,0 +1,59 @@

+_BASE_: Base-Cityscapes-UnifiedSegmentation.yaml
+MODEL:
+  META_ARCHITECTURE: "OneFormer"
+  SEM_SEG_HEAD:
+    NAME: "OneFormerHead"
+    IGNORE_VALUE: 255
+    NUM_CLASSES: 19
+    LOSS_WEIGHT: 1.0
+    CONVS_DIM: 256
+    MASK_DIM: 256
+    NORM: "GN"
+    # pixel decoder
+    PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
+    IN_FEATURES: ["res2", "res3", "res4", "res5"]
+    DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
+    COMMON_STRIDE: 4
+    TRANSFORMER_ENC_LAYERS: 6
+  ONE_FORMER:
+    TRANSFORMER_DECODER_NAME: "ContrastiveMultiScaleMaskedTransformerDecoder"
+    TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
+    DEEP_SUPERVISION: True
+    NO_OBJECT_WEIGHT: 0.1
+    CLASS_WEIGHT: 2.0
+    MASK_WEIGHT: 5.0
+    DICE_WEIGHT: 5.0
+    CONTRASTIVE_WEIGHT: 0.5
+    CONTRASTIVE_TEMPERATURE: 0.07
+    HIDDEN_DIM: 256
+    NUM_OBJECT_QUERIES: 150
+    USE_TASK_NORM: True
+    NHEADS: 8
+    DROPOUT: 0.1
+    DIM_FEEDFORWARD: 2048
+    ENC_LAYERS: 0  # listed once; YAML mapping keys must be unique
+    PRE_NORM: False
+    ENFORCE_INPUT_PROJ: False
+    SIZE_DIVISIBILITY: 32
+    CLASS_DEC_LAYERS: 2
+    DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
+    TRAIN_NUM_POINTS: 12544
+    OVERSAMPLE_RATIO: 3.0
+    IMPORTANCE_SAMPLE_RATIO: 0.75
+  TEXT_ENCODER:
+    WIDTH: 256
+    CONTEXT_LENGTH: 77
+    NUM_LAYERS: 6
+    VOCAB_SIZE: 49408
+    PROJ_NUM_LAYERS: 2
+    N_CTX: 16
+  TEST:
+    SEMANTIC_ON: True
+    INSTANCE_ON: True
+    PANOPTIC_ON: True
+    OVERLAP_THRESHOLD: 0.8
+    OBJECT_MASK_THRESHOLD: 0.8
+    TASK: "panoptic"
+TEST:
+  DETECTIONS_PER_IMAGE: 150

configs/cityscapes/oneformer_dinat_large_bs16_90k.yaml ADDED Viewed

	@@ -0,0 +1,22 @@

+_BASE_: oneformer_R50_bs16_90k.yaml
+MODEL:
+  BACKBONE:
+    NAME: "D2DiNAT"
+  DiNAT:
+    EMBED_DIM: 192
+    MLP_RATIO: 2.0
+    DEPTHS: [3, 4, 18, 5]
+    NUM_HEADS: [6, 12, 24, 48]
+    KERNEL_SIZE: 7
+    DROP_PATH_RATE: 0.3
+    DILATIONS: [[1, 18, 1], [1, 5, 1, 9], [1, 2, 1, 3, 1, 4, 1, 2, 1, 3, 1, 4, 1, 2, 1, 3, 1, 4], [1, 2, 1, 2, 1]]
+  WEIGHTS: "dinat_large_in22k_224.pkl"
+  PIXEL_MEAN: [123.675, 116.280, 103.530]
+  PIXEL_STD: [58.395, 57.120, 57.375]
+  ONE_FORMER:
+    NUM_OBJECT_QUERIES: 250
+SOLVER:
+  AMP:
+    ENABLED: False
+TEST:
+  DETECTIONS_PER_IMAGE: 250

configs/cityscapes/oneformer_swin_large_IN21k_384_bs16_90k.yaml ADDED Viewed

	@@ -0,0 +1,20 @@

+_BASE_: oneformer_R50_bs16_90k.yaml
+MODEL:
+  BACKBONE:
+    NAME: "D2SwinTransformer"
+  SWIN:
+    EMBED_DIM: 192
+    DEPTHS: [2, 2, 18, 2]
+    NUM_HEADS: [6, 12, 24, 48]
+    WINDOW_SIZE: 12
+    APE: False
+    DROP_PATH_RATE: 0.3
+    PATCH_NORM: True
+    PRETRAIN_IMG_SIZE: 384
+  WEIGHTS: "swin_large_patch4_window12_384_22k.pkl"
+  PIXEL_MEAN: [123.675, 116.280, 103.530]
+  PIXEL_STD: [58.395, 57.120, 57.375]
+  ONE_FORMER:
+    NUM_OBJECT_QUERIES: 250
+TEST:
+  DETECTIONS_PER_IMAGE: 250

configs/coco/Base-COCO-UnifiedSegmentation.yaml ADDED Viewed

	@@ -0,0 +1,54 @@

+MODEL:
+  BACKBONE:
+    FREEZE_AT: 0
+    NAME: "build_resnet_backbone"
+  WEIGHTS: "detectron2://ImageNetPretrained/torchvision/R-50.pkl"
+  PIXEL_MEAN: [123.675, 116.280, 103.530]
+  PIXEL_STD: [58.395, 57.120, 57.375]
+  RESNETS:
+    DEPTH: 50
+    STEM_TYPE: "basic"  # not used
+    STEM_OUT_CHANNELS: 64
+    STRIDE_IN_1X1: False
+    OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+    # NORM: "SyncBN"
+    RES5_MULTI_GRID: [1, 1, 1]  # not used
+DATASETS:
+  TRAIN: ("coco_2017_train_panoptic_with_sem_seg",)
+  TEST_PANOPTIC: ("coco_2017_val_panoptic_with_sem_seg",)  # to evaluate instance and semantic performance as well
+  TEST_INSTANCE: ("coco_2017_val",)
+  TEST_SEMANTIC: ("coco_2017_val_panoptic_with_sem_seg",)
+SOLVER:
+  IMS_PER_BATCH: 16
+  BASE_LR: 0.0001
+  STEPS: (327778, 355092)
+  MAX_ITER: 368750
+  WARMUP_FACTOR: 1.0
+  WARMUP_ITERS: 10
+  WEIGHT_DECAY: 0.05
+  OPTIMIZER: "ADAMW"
+  BACKBONE_MULTIPLIER: 0.1
+  CLIP_GRADIENTS:
+    ENABLED: True
+    CLIP_TYPE: "full_model"
+    CLIP_VALUE: 0.01
+    NORM_TYPE: 2.0
+  AMP:
+    ENABLED: True
+INPUT:
+  IMAGE_SIZE: 1024
+  MIN_SCALE: 0.1
+  MAX_SCALE: 2.0
+  FORMAT: "RGB"
+  DATASET_MAPPER_NAME: "coco_unified_lsj"
+  MAX_SEQ_LEN: 77
+  TASK_SEQ_LEN: 77
+  TASK_PROB:
+    SEMANTIC: 0.33
+    INSTANCE: 0.66
+TEST:
+  EVAL_PERIOD: 5000
+DATALOADER:
+  FILTER_EMPTY_ANNOTATIONS: True
+  NUM_WORKERS: 4
+VERSION: 2

configs/coco/oneformer_R50_bs16_50ep.yaml ADDED Viewed

	@@ -0,0 +1,59 @@

+_BASE_: Base-COCO-UnifiedSegmentation.yaml
+MODEL:
+  META_ARCHITECTURE: "OneFormer"
+  SEM_SEG_HEAD:
+    NAME: "OneFormerHead"
+    IGNORE_VALUE: 255
+    NUM_CLASSES: 133
+    LOSS_WEIGHT: 1.0
+    CONVS_DIM: 256
+    MASK_DIM: 256
+    NORM: "GN"
+    # pixel decoder
+    PIXEL_DECODER_NAME: "MSDeformAttnPixelDecoder"
+    IN_FEATURES: ["res2", "res3", "res4", "res5"]
+    DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
+    COMMON_STRIDE: 4
+    TRANSFORMER_ENC_LAYERS: 6
+  ONE_FORMER:
+    TRANSFORMER_DECODER_NAME: "ContrastiveMultiScaleMaskedTransformerDecoder"
+    TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
+    DEEP_SUPERVISION: True
+    NO_OBJECT_WEIGHT: 0.1
+    CLASS_WEIGHT: 2.0
+    MASK_WEIGHT: 5.0
+    DICE_WEIGHT: 5.0
+    CONTRASTIVE_WEIGHT: 0.5
+    CONTRASTIVE_TEMPERATURE: 0.07
+    HIDDEN_DIM: 256
+    NUM_OBJECT_QUERIES: 150
+    USE_TASK_NORM: True
+    NHEADS: 8
+    DROPOUT: 0.1
+    DIM_FEEDFORWARD: 2048
+    ENC_LAYERS: 0
+    PRE_NORM: False
+    ENFORCE_INPUT_PROJ: False
+    SIZE_DIVISIBILITY: 32
+    CLASS_DEC_LAYERS: 2
+    DEC_LAYERS: 10  # 9 decoder layers, add one for the loss on learnable query
+    TRAIN_NUM_POINTS: 12544
+    OVERSAMPLE_RATIO: 3.0
+    IMPORTANCE_SAMPLE_RATIO: 0.75
+  TEXT_ENCODER:
+    WIDTH: 256
+    CONTEXT_LENGTH: 77
+    NUM_LAYERS: 6
+    VOCAB_SIZE: 49408
+    PROJ_NUM_LAYERS: 2
+    N_CTX: 16
+  TEST:
+    SEMANTIC_ON: True
+    INSTANCE_ON: True
+    PANOPTIC_ON: True
+    DETECTION_ON: False
+    OVERLAP_THRESHOLD: 0.8
+    OBJECT_MASK_THRESHOLD: 0.8
+    TASK: "panoptic"
+TEST:
+  DETECTIONS_PER_IMAGE: 150

configs/coco/oneformer_dinat_large_bs16_100ep.yaml ADDED Viewed

	@@ -0,0 +1,22 @@

+# OneFormer COCO config using the "D2DiNAT" backbone with DiNAT-Large weights.
+# Keys not listed here are presumably inherited from the file named in _BASE_
+# (detectron2-style config inheritance -- confirm against the config loader).
+_BASE_: oneformer_R50_bs16_50ep.yaml
+MODEL:
+  BACKBONE:
+    NAME: "D2DiNAT"
+  DiNAT:
+    EMBED_DIM: 192
+    MLP_RATIO: 2.0
+    DEPTHS: [3, 4, 18, 5]
+    NUM_HEADS: [6, 12, 24, 48]
+    KERNEL_SIZE: 11
+    DROP_PATH_RATE: 0.3
+    # One inner list per DEPTHS entry; each inner list has exactly that many values (3, 4, 18, 5).
+    DILATIONS: [[1, 20, 1], [1, 5, 1, 10], [1, 2, 1, 3, 1, 4, 1, 5, 1, 2, 1, 3, 1, 4, 1, 5, 1, 5], [1, 2, 1, 2, 1]]
+  WEIGHTS: "dinat_large_in22k_in1k_384_11x11.pkl"
+  PIXEL_MEAN: [123.675, 116.280, 103.530]
+  PIXEL_STD: [58.395, 57.120, 57.375]
+  ONE_FORMER:
+    NUM_OBJECT_QUERIES: 150
+SOLVER:
+  # Both STEPS values are below MAX_ITER.
+  STEPS: (655556, 710184)
+  MAX_ITER: 737500
+TEST:
+  # Same value as MODEL.ONE_FORMER.NUM_OBJECT_QUERIES above.
+  DETECTIONS_PER_IMAGE: 150

configs/coco/oneformer_swin_large_IN21k_384_bs16_100ep.yaml ADDED Viewed

	@@ -0,0 +1,25 @@

+# OneFormer COCO config using the "D2SwinTransformer" backbone with Swin-Large weights.
+# Keys not listed here are presumably inherited from the file named in _BASE_
+# (detectron2-style config inheritance -- confirm against the config loader).
+_BASE_: oneformer_R50_bs16_50ep.yaml
+MODEL:
+  BACKBONE:
+    NAME: "D2SwinTransformer"
+  SWIN:
+    EMBED_DIM: 192
+    # DEPTHS and NUM_HEADS each carry four entries.
+    DEPTHS: [2, 2, 18, 2]
+    NUM_HEADS: [6, 12, 24, 48]
+    WINDOW_SIZE: 12
+    APE: False
+    DROP_PATH_RATE: 0.3
+    PATCH_NORM: True
+    PRETRAIN_IMG_SIZE: 384
+  WEIGHTS: "swin_large_patch4_window12_384_22k.pkl"
+  PIXEL_MEAN: [123.675, 116.280, 103.530]
+  PIXEL_STD: [58.395, 57.120, 57.375]
+  ONE_FORMER:
+    NUM_OBJECT_QUERIES: 150
+SOLVER:
+  # Both STEPS values are below MAX_ITER.
+  STEPS: (655556, 735184)
+  MAX_ITER: 737500
+  # Mixed precision is explicitly switched off for this config.
+  AMP:
+    ENABLED: False
+TEST:
+  # Same value as MODEL.ONE_FORMER.NUM_OBJECT_QUERIES above.
+  DETECTIONS_PER_IMAGE: 150

deform_setup.sh ADDED Viewed

	@@ -0,0 +1,21 @@

+#!/usr/bin/env bash
+# ln -s ./oneformer/modeling/pixel_decoder/ops/ ./
+# ls
+# cd ops/ && bash make.sh && cd ..
+echo '----------------------------------------------------------------'
+echo '----------------------------------------------------------------'
+pip3 freeze | grep MultiScaleDeformableAttention
+pip3 freeze | grep torch
+pip3 freeze | grep detectron2
+pip3 freeze | grep natten
+echo '----------------------------------------------------------------'
+echo '----------------------------------------------------------------'
+# echo '----------------------------------------------------------------'
+# echo '----------------------------------------------------------------'
+# cd /home/user/.pyenv/versions/3.8.15/lib/python3.8/site-packages
+# ls
+# ls | grep MultiScale
+# echo '----------------------------------------------------------------'
+# echo '----------------------------------------------------------------'

deform_setup_cpu.sh ADDED Viewed

	@@ -0,0 +1,11 @@

+#!/usr/bin/env bash
+echo '----------------------------------------------------------------'
+echo 'CPU Setup - Skipping CUDA operations compilation'
+echo '----------------------------------------------------------------'
+pip3 freeze | grep torch
+pip3 freeze | grep detectron2
+pip3 freeze | grep natten
+echo '----------------------------------------------------------------'
+echo 'CPU setup complete'
+echo '----------------------------------------------------------------'

demo/colormap.py ADDED Viewed

	@@ -0,0 +1,170 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+"""
+An awesome colormap for really neat visualizations.
+Copied from Detectron, and removed gray colors.
+"""
+import numpy as np
+import random
+random.seed(0)
+__all__ = ["colormap", "random_color", "random_colors"]
+# fmt: off
+# RGB:
+# _COLORS = np.array(
+#     [
+#         0.000, 0.447, 0.741,
+#         0.850, 0.325, 0.098,
+#         0.929, 0.694, 0.125,
+#         0.494, 0.184, 0.556,
+#         0.466, 0.674, 0.188,
+#         0.301, 0.745, 0.933,
+#         0.635, 0.078, 0.184,
+#         0.300, 0.300, 0.300,
+#         0.600, 0.600, 0.600,
+#         1.000, 0.000, 0.000,
+#         1.000, 0.500, 0.000,
+#         0.749, 0.749, 0.000,
+#         0.000, 1.000, 0.000,
+#         0.000, 0.000, 1.000,
+#         0.667, 0.000, 1.000,
+#         0.333, 0.333, 0.000,
+#         0.333, 0.667, 0.000,
+#         0.333, 1.000, 0.000,
+#         0.667, 0.333, 0.000,
+#         0.667, 0.667, 0.000,
+#         0.667, 1.000, 0.000,
+#         1.000, 0.333, 0.000,
+#         1.000, 0.667, 0.000,
+#         1.000, 1.000, 0.000,
+#         0.000, 0.333, 0.500,
+#         0.000, 0.667, 0.500,
+#         0.000, 1.000, 0.500,
+#         0.333, 0.000, 0.500,
+#         0.333, 0.333, 0.500,
+#         0.333, 0.667, 0.500,
+#         0.333, 1.000, 0.500,
+#         0.667, 0.000, 0.500,
+#         0.667, 0.333, 0.500,
+#         0.667, 0.667, 0.500,
+#         0.667, 1.000, 0.500,
+#         1.000, 0.000, 0.500,
+#         1.000, 0.333, 0.500,
+#         1.000, 0.667, 0.500,
+#         1.000, 1.000, 0.500,
+#         0.000, 0.333, 1.000,
+#         0.000, 0.667, 1.000,
+#         0.000, 1.000, 1.000,
+#         0.333, 0.000, 1.000,
+#         0.333, 0.333, 1.000,
+#         0.333, 0.667, 1.000,
+#         0.333, 1.000, 1.000,
+#         0.667, 0.000, 1.000,
+#         0.667, 0.333, 1.000,
+#         0.667, 0.667, 1.000,
+#         0.667, 1.000, 1.000,
+#         1.000, 0.000, 1.000,
+#         1.000, 0.333, 1.000,
+#         1.000, 0.667, 1.000,
+#         0.333, 0.000, 0.000,
+#         0.500, 0.000, 0.000,
+#         0.667, 0.000, 0.000,
+#         0.833, 0.000, 0.000,
+#         1.000, 0.000, 0.000,
+#         0.000, 0.167, 0.000,
+#         0.000, 0.333, 0.000,
+#         0.000, 0.500, 0.000,
+#         0.000, 0.667, 0.000,
+#         0.000, 0.833, 0.000,
+#         0.000, 1.000, 0.000,
+#         0.000, 0.000, 0.167,
+#         0.000, 0.000, 0.333,
+#         0.000, 0.000, 0.500,
+#         0.000, 0.000, 0.667,
+#         0.000, 0.000, 0.833,
+#         0.000, 0.000, 1.000,
+#         0.000, 0.000, 0.000,
+#         0.143, 0.143, 0.143,
+#         0.857, 0.857, 0.857,
+#         1.000, 1.000, 1.000
+#     ]
+# ).astype(np.float32).reshape(-1, 3)
+# fmt: on
+_COLORS = []
+def gen_color():
+    color = tuple(np.round(np.random.choice(range(256), size=3)/255, 3))
+    if color not in _COLORS and np.mean(color) != 0.0:
+        _COLORS.append(color)
+    else:
+        gen_color()
+for _ in range(300):
+    gen_color()
+def colormap(rgb=False, maximum=255):
+    """
+    Args:
+        rgb (bool): whether to return RGB colors or BGR colors.
+        maximum (int): either 255 or 1
+    Returns:
+        ndarray: a float32 array of Nx3 colors, in range [0, 255] or [0, 1]
+    """
+    assert maximum in [255, 1], maximum
+    c = _COLORS * maximum
+    if not rgb:
+        c = c[:, ::-1]
+    return c
+def random_color(rgb=False, maximum=255):
+    """
+    Args:
+        rgb (bool): whether to return RGB colors or BGR colors.
+        maximum (int): either 255 or 1
+    Returns:
+        ndarray: a vector of 3 numbers
+    """
+    idx = np.random.randint(0, len(_COLORS))
+    ret = _COLORS[idx] * maximum
+    if not rgb:
+        ret = ret[::-1]
+    return ret
+def random_colors(N, rgb=False, maximum=255):
+    """
+    Args:
+        N (int): number of unique colors needed
+        rgb (bool): whether to return RGB colors or BGR colors.
+        maximum (int): either 255 or 1
+    Returns:
+        ndarray: a list of random_color
+    """
+    indices = random.sample(range(len(_COLORS)), N)
+    ret = [_COLORS[i] * maximum for i in indices]
+    if not rgb:
+        ret = [x[::-1] for x in ret]
+    return ret
+if __name__ == "__main__":
+    import cv2
+    size = 100
+    H, W = 10, 10
+    canvas = np.random.rand(H * size, W * size, 3).astype("float32")
+    for h in range(H):
+        for w in range(W):
+            idx = h * W + w
+            if idx >= len(_COLORS):
+                break
+            canvas[h * size : (h + 1) * size, w * size : (w + 1) * size] = _COLORS[idx]
+    cv2.imshow("a", canvas)
+    cv2.waitKey(0)

demo/defaults.py ADDED Viewed

	@@ -0,0 +1,77 @@

+import torch
+import detectron2.data.transforms as T
+from detectron2.checkpoint import DetectionCheckpointer
+from detectron2.data import (
+    MetadataCatalog,
+)
+from detectron2.modeling import build_model
+__all__ = [
+    "DefaultPredictor",
+]
+class DefaultPredictor:
+    """
+    Create a simple end-to-end predictor with the given config that runs on
+    single device for a single input image.
+    Compared to using the model directly, this class does the following additions:
+    1. Load checkpoint from `cfg.MODEL.WEIGHTS`.
+    2. Always take BGR image as the input and apply conversion defined by `cfg.INPUT.FORMAT`.
+    3. Apply resizing defined by `cfg.INPUT.{MIN,MAX}_SIZE_TEST`.
+    4. Take one input image and produce a single output, instead of a batch.
+    This is meant for simple demo purposes, so it does the above steps automatically.
+    This is not meant for benchmarks or running complicated inference logic.
+    If you'd like to do anything more complicated, please refer to its source code as
+    examples to build and use the model manually.
+    Attributes:
+        metadata (Metadata): the metadata of the underlying dataset, obtained from
+            cfg.DATASETS.TEST.
+    Examples:
+    ::
+        pred = DefaultPredictor(cfg)
+        inputs = cv2.imread("input.jpg")
+        outputs = pred(inputs)
+    """
+    def __init__(self, cfg):
+        self.cfg = cfg.clone()  # cfg can be modified by model
+        self.model = build_model(self.cfg)
+        self.model.eval()
+        if len(cfg.DATASETS.TEST):
+            self.metadata = MetadataCatalog.get(cfg.DATASETS.TEST[0])
+        checkpointer = DetectionCheckpointer(self.model)
+        checkpointer.load(cfg.MODEL.WEIGHTS)
+        self.aug = T.ResizeShortestEdge(
+            [cfg.INPUT.MIN_SIZE_TEST, cfg.INPUT.MIN_SIZE_TEST], cfg.INPUT.MAX_SIZE_TEST
+        )
+        self.input_format = cfg.INPUT.FORMAT
+        assert self.input_format in ["RGB", "BGR"], self.input_format
+    def __call__(self, original_image, task):
+        """
+        Args:
+            original_image (np.ndarray): an image of shape (H, W, C) (in BGR order).
+        Returns:
+            predictions (dict):
+                the output of the model for one image only.
+                See :doc:`/tutorials/models` for details about the format.
+        """
+        with torch.no_grad():  # https://github.com/sphinx-doc/sphinx/issues/4258
+            # Apply pre-processing to image.
+            if self.input_format == "RGB":
+                # whether the model expects BGR inputs or RGB
+                original_image = original_image[:, :, ::-1]
+            height, width = original_image.shape[:2]
+            image = self.aug.get_transform(original_image).apply_image(original_image)
+            image = torch.as_tensor(image.astype("float32").transpose(2, 0, 1))
+            task = f"The task is {task}"
+            inputs = {"image": image, "height": height, "width": width, "task": task}
+            predictions = self.model([inputs])[0]
+            return predictions

demo/predictor.py ADDED Viewed

	@@ -0,0 +1,190 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+# Copied from: https://github.com/facebookresearch/detectron2/blob/master/demo/predictor.py
+import atexit
+import bisect
+import multiprocessing as mp
+from collections import deque
+import cv2
+import torch
+from detectron2.data import MetadataCatalog
+from defaults import DefaultPredictor
+from detectron2.utils.video_visualizer import VideoVisualizer
+from visualizer import ColorMode, Visualizer
+class VisualizationDemo(object):
+    def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False):
+        """
+        Args:
+            cfg (CfgNode):
+            instance_mode (ColorMode):
+            parallel (bool): whether to run the model in different processes from visualization.
+                Useful since the visualization logic can be slow.
+        """
+        self.metadata = MetadataCatalog.get(
+            cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused"
+        )
+        if 'cityscapes_fine_sem_seg_val' in cfg.DATASETS.TEST[0]:
+            from cityscapesscripts.helpers.labels import labels
+            stuff_colors = [k.color for k in labels if k.trainId != 255]
+            self.metadata = self.metadata.set(stuff_colors=stuff_colors)
+        self.cpu_device = torch.device("cpu")
+        self.instance_mode = instance_mode
+        self.parallel = parallel
+        if parallel:
+            num_gpu = torch.cuda.device_count()
+            self.predictor = AsyncPredictor(cfg, num_gpus=num_gpu)
+        else:
+            self.predictor = DefaultPredictor(cfg)
+    def run_on_image(self, image, task, sem_gt, pan_gt, ins_gt, box_gt):
+        """
+        Args:
+            image (np.ndarray): an image of shape (H, W, C) (in BGR order).
+                This is the format used by OpenCV.
+        Returns:
+            predictions (dict): the output of the model.
+            vis_output (VisImage): the visualized image output.
+        """
+        vis_output = None
+        # Convert image from OpenCV BGR format to Matplotlib RGB format.
+        image = image[:, :, ::-1]
+        vis_output = {}
+        if task == 'panoptic':
+            visualizer = Visualizer(image, metadata=self.metadata, instance_mode=0)
+            predictions = self.predictor(image, "panoptic")
+            panoptic_seg, segments_info = predictions["panoptic_seg"]
+            vis_output['panoptic'] = visualizer.draw_panoptic_seg_predictions(
+            panoptic_seg.to(self.cpu_device), segments_info, alpha=1
+        )
+            # visualizer = Visualizer(image, metadata=self.metadata, instance_mode=0)
+            # vis_output['pan_gt'] = visualizer.draw_panoptic_seg(
+            #     pan_gt[0].to(self.cpu_device), pan_gt[1], alpha=1
+            # )
+        if task == 'panoptic' or task == 'semantic':
+            visualizer = Visualizer(image, metadata=self.metadata, instance_mode=1)
+            predictions = self.predictor(image, "semantic")
+            vis_output['semantic'] = visualizer.draw_sem_seg(
+                predictions["sem_seg"].argmax(dim=0).to(self.cpu_device), alpha=1
+            )
+            # visualizer = Visualizer(image, metadata=self.metadata, instance_mode=1)
+            # vis_output['gt_sem'] = visualizer.draw_sem_seg(
+            #     sem_gt.to(self.cpu_device), alpha=1
+            # )
+        if task == 'panoptic' or task == 'instance':
+            visualizer = Visualizer(image, metadata=self.metadata, instance_mode=2)
+            predictions = self.predictor(image, "instance")
+            instances = predictions["instances"].to(self.cpu_device)
+            vis_output['instance'] = visualizer.draw_instance_predictions(predictions=instances, alpha=1)
+            if 'boxes' in predictions:
+                boxes, labels, scores  = predictions["boxes"]
+                visualizer = Visualizer(image, False, metadata=self.metadata, instance_mode=0)
+                vis_output['boxes'] = visualizer.draw_box_predictions(
+                        boxes.to(self.cpu_device), labels.to(self.cpu_device), scores.to(self.cpu_device))
+            # visualizer = Visualizer(image, metadata=self.metadata, instance_mode=2)
+            # vis_output['ins_gt'] = visualizer.draw_instance_predictions(predictions=ins_gt.to(self.cpu_device), alpha=1)
+        # vis_output['input'] = visualizer.get_image(image)
+        return predictions, vis_output
+class AsyncPredictor:
+    """
+    A predictor that runs the model asynchronously, possibly on >1 GPUs.
+    Because rendering the visualization takes considerably amount of time,
+    this helps improve throughput a little bit when rendering videos.
+    """
+    class _StopToken:
+        pass
+    class _PredictWorker(mp.Process):
+        def __init__(self, cfg, task_queue, result_queue):
+            self.cfg = cfg
+            self.task_queue = task_queue
+            self.result_queue = result_queue
+            super().__init__()
+        def run(self):
+            predictor = DefaultPredictor(self.cfg)
+            while True:
+                task = self.task_queue.get()
+                if isinstance(task, AsyncPredictor._StopToken):
+                    break
+                idx, data = task
+                result = predictor(data)
+                self.result_queue.put((idx, result))
+    def __init__(self, cfg, num_gpus: int = 1):
+        """
+        Args:
+            cfg (CfgNode):
+            num_gpus (int): if 0, will run on CPU
+        """
+        num_workers = max(num_gpus, 1)
+        self.task_queue = mp.Queue(maxsize=num_workers * 3)
+        self.result_queue = mp.Queue(maxsize=num_workers * 3)
+        self.procs = []
+        for gpuid in range(max(num_gpus, 1)):
+            cfg = cfg.clone()
+            cfg.defrost()
+            cfg.MODEL.DEVICE = "cuda:{}".format(gpuid) if num_gpus > 0 else "cpu"
+            self.procs.append(
+                AsyncPredictor._PredictWorker(cfg, self.task_queue, self.result_queue)
+            )
+        self.put_idx = 0
+        self.get_idx = 0
+        self.result_rank = []
+        self.result_data = []
+        for p in self.procs:
+            p.start()
+        atexit.register(self.shutdown)
+    def put(self, image):
+        self.put_idx += 1
+        self.task_queue.put((self.put_idx, image))
+    def get(self):
+        self.get_idx += 1  # the index needed for this request
+        if len(self.result_rank) and self.result_rank[0] == self.get_idx:
+            res = self.result_data[0]
+            del self.result_data[0], self.result_rank[0]
+            return res
+        while True:
+            # make sure the results are returned in the correct order
+            idx, res = self.result_queue.get()
+            if idx == self.get_idx:
+                return res
+            insert = bisect.bisect(self.result_rank, idx)
+            self.result_rank.insert(insert, idx)
+            self.result_data.insert(insert, res)
+    def __len__(self):
+        return self.put_idx - self.get_idx
+    def __call__(self, image):
+        self.put(image)
+        return self.get()
+    def shutdown(self):
+        for _ in self.procs:
+            self.task_queue.put(AsyncPredictor._StopToken())
+    @property
+    def default_buffer_size(self):
+        return len(self.procs) * 5

demo/visualizer.py ADDED Viewed

	@@ -0,0 +1,1350 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+import colorsys
+import logging
+import math
+import numpy as np
+from enum import Enum, unique
+import cv2
+import matplotlib as mpl
+import matplotlib.colors as mplc
+import matplotlib.figure as mplfigure
+import pycocotools.mask as mask_util
+import torch
+from matplotlib.backends.backend_agg import FigureCanvasAgg
+from PIL import Image
+from detectron2.data import MetadataCatalog
+from detectron2.structures import BitMasks, Boxes, BoxMode, Keypoints, PolygonMasks, RotatedBoxes
+from detectron2.utils.file_io import PathManager
+import random
+random.seed(0)
+from .colormap import random_color, _COLORS
+logger = logging.getLogger(__name__)
+__all__ = ["ColorMode", "VisImage", "Visualizer"]
+_SMALL_OBJECT_AREA_THRESH = 1000
+_LARGE_MASK_AREA_THRESH = 120000
+_OFF_WHITE = (1.0, 1.0, 240.0 / 255)
+_BLACK = (0, 0, 0)
+_RED = (1.0, 0, 0)
+_KEYPOINT_THRESHOLD = 0.05
+def instance_color(rgb=False, idx=1, maximum=255):
+    """
+    Args:
+        rgb (bool): whether to return RGB colors or BGR colors.
+        maximum (int): either 255 or 1
+    Returns:
+        ndarray: a vector of 3 numbers
+    """
+    ret = _COLORS[idx] * maximum
+    if not rgb:
+        ret = ret[::-1]
+    return ret
+@unique
+class ColorMode(Enum):
+    """
+    Enum of different color modes to use for instance visualizations.
+    """
+    IMAGE = 0
+    """
+    Picks a random color for every instance and overlay segmentations with low opacity.
+    """
+    SEGMENTATION = 1
+    """
+    Let instances of the same category have similar colors
+    (from metadata.thing_colors), and overlay them with
+    high opacity. This provides more attention on the quality of segmentation.
+    """
+    IMAGE_BW = 2
+    """
+    Same as IMAGE, but convert all areas without masks to gray-scale.
+    Only available for drawing per-instance mask predictions.
+    """
+class GenericMask:
+    """
+    One mask, accepted as an RLE dict, a list of polygons or a binary array,
+    with lazy conversion between the mask and polygon representations.
+    Attribute:
+        polygons (list[ndarray]): polygons for this mask.
+            Each ndarray has format [x, y, x, y, ...]
+        mask (ndarray): a binary mask
+    """
+    def __init__(self, mask_or_polygons, height, width):
+        """
+        Args:
+            mask_or_polygons: a dict with "counts" and "size" keys (RLE),
+                a list of polygons, or an ndarray of shape (height, width).
+            height (int), width (int): size of the mask.
+        Raises:
+            ValueError: if the input is none of the three supported types.
+        """
+        # All three caches start empty; each is filled on first use.
+        self._mask = self._polygons = self._has_holes = None
+        self.height = height
+        self.width = width
+        m = mask_or_polygons
+        if isinstance(m, dict):
+            # RLEs
+            assert "counts" in m and "size" in m
+            if isinstance(m["counts"], list):  # uncompressed RLEs
+                h, w = m["size"]
+                assert h == height and w == width
+                m = mask_util.frPyObjects(m, h, w)
+            self._mask = mask_util.decode(m)[:, :]
+            return
+        if isinstance(m, list):  # list[ndarray]
+            # Flatten every polygon to a 1-D coordinate array.
+            self._polygons = [np.asarray(x).reshape(-1) for x in m]
+            return
+        if isinstance(m, np.ndarray):  # assumed to be a binary mask
+            assert m.shape[1] != 2, m.shape
+            assert m.shape == (
+                height,
+                width,
+            ), f"mask shape: {m.shape}, target dims: {height}, {width}"
+            self._mask = m.astype("uint8")
+            return
+        raise ValueError("GenericMask cannot handle object {} of type '{}'".format(m, type(m)))
+    @property
+    def mask(self):
+        """The binary mask; rasterized from the polygons on first access if needed."""
+        if self._mask is None:
+            self._mask = self.polygons_to_mask(self._polygons)
+        return self._mask
+    @property
+    def polygons(self):
+        """The polygon list; traced from the mask on first access if needed."""
+        if self._polygons is None:
+            # Tracing the contours also tells us whether the mask has holes.
+            self._polygons, self._has_holes = self.mask_to_polygons(self._mask)
+        return self._polygons
+    @property
+    def has_holes(self):
+        """Whether the mask has holes; always False when built from polygons."""
+        if self._has_holes is None:
+            if self._mask is not None:
+                self._polygons, self._has_holes = self.mask_to_polygons(self._mask)
+            else:
+                self._has_holes = False  # if original format is polygon, does not have holes
+        return self._has_holes
+    def mask_to_polygons(self, mask):
+        """
+        Trace the contours of ``mask`` with OpenCV.
+        Returns:
+            (list[ndarray], bool): flattened polygons shifted by +0.5, and
+            whether any contour has a parent (i.e. is a hole). An empty mask
+            yields ``([], False)``.
+        """
+        # cv2.RETR_CCOMP flag retrieves all the contours and arranges them to a 2-level
+        # hierarchy. External contours (boundary) of the object are placed in hierarchy-1.
+        # Internal contours (holes) are placed in hierarchy-2.
+        # cv2.CHAIN_APPROX_NONE flag gets vertices of polygons from contours.
+        mask = np.ascontiguousarray(mask)  # some versions of cv2 does not support incontiguous arr
+        res = cv2.findContours(mask.astype("uint8"), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE)
+        # Index from the end so both the 2- and 3-element return forms work.
+        hierarchy = res[-1]
+        if hierarchy is None:  # empty mask
+            return [], False
+        # Column 3 of the hierarchy is >= 0 for contours that have a parent.
+        has_holes = (hierarchy.reshape(-1, 4)[:, 3] >= 0).sum() > 0
+        res = res[-2]
+        res = [x.flatten() for x in res]
+        # These coordinates from OpenCV are integers in range [0, W-1 or H-1].
+        # We add 0.5 to turn them into real-value coordinate space. A better solution
+        # would be to first +0.5 and then dilate the returned polygon by 0.5.
+        # Contours with fewer than 3 points (6 coordinates) are dropped.
+        res = [x + 0.5 for x in res if len(x) >= 6]
+        return res, has_holes
+    def polygons_to_mask(self, polygons):
+        """Rasterize ``polygons`` into one merged mask via pycocotools."""
+        rle = mask_util.frPyObjects(polygons, self.height, self.width)
+        rle = mask_util.merge(rle)
+        return mask_util.decode(rle)[:, :]
+    def area(self):
+        """Sum of the mask values (pixel count for a 0/1 mask)."""
+        return self.mask.sum()
+    def bbox(self):
+        """Bounding box of the merged polygons."""
+        p = mask_util.frPyObjects(self.polygons, self.height, self.width)
+        p = mask_util.merge(p)
+        bbox = mask_util.toBbox(p)
+        # presumably converts (x, y, w, h) into (x0, y0, x1, y1) -- confirm toBbox's format
+        bbox[2] += bbox[0]
+        bbox[3] += bbox[1]
+        return bbox
+class _PanopticPrediction:
+    """
+    Unify different panoptic annotation/prediction formats
+    """
+    def __init__(self, panoptic_seg, segments_info, metadata=None):
+        if segments_info is None:
+            assert metadata is not None
+            # If "segments_info" is None, we assume "panoptic_img" is a
+            # H*W int32 image storing the panoptic_id in the format of
+            # category_id * label_divisor + instance_id. We reserve -1 for
+            # VOID label.
+            label_divisor = metadata.label_divisor
+            segments_info = []
+            for panoptic_label in np.unique(panoptic_seg.numpy()):
+                if panoptic_label == -1:
+                    # VOID region.
+                    continue
+                pred_class = panoptic_label // label_divisor
+                isthing = pred_class in metadata.thing_dataset_id_to_contiguous_id.values()
+                segments_info.append(
+                    {
+                        "id": int(panoptic_label),
+                        "category_id": int(pred_class),
+                        "isthing": bool(isthing),
+                    }
+                )
+        del metadata
+        self._seg = panoptic_seg
+        self._sinfo = {s["id"]: s for s in segments_info}  # seg id -> seg info
+        segment_ids, areas = torch.unique(panoptic_seg, sorted=True, return_counts=True)
+        areas = areas.numpy()
+        sorted_idxs = np.argsort(-areas)
+        self._seg_ids, self._seg_areas = segment_ids[sorted_idxs], areas[sorted_idxs]
+        self._seg_ids = self._seg_ids.tolist()
+        for sid, area in zip(self._seg_ids, self._seg_areas):
+            if sid in self._sinfo:
+                self._sinfo[sid]["area"] = float(area)
+    def non_empty_mask(self):
+        """
+        Returns:
+            (H, W) array, a mask for all pixels that have a prediction
+        """
+        empty_ids = []
+        for id in self._seg_ids:
+            if id not in self._sinfo:
+                empty_ids.append(id)
+        if len(empty_ids) == 0:
+            return np.zeros(self._seg.shape, dtype=np.uint8)
+        assert (
+            len(empty_ids) == 1
+        ), ">1 ids corresponds to no labels. This is currently not supported"
+        return (self._seg != empty_ids[0]).numpy().astype(np.bool)
+    def semantic_masks(self):
+        for sid in self._seg_ids:
+            sinfo = self._sinfo.get(sid)
+            if sinfo is None or sinfo["isthing"]:
+                # Some pixels (e.g. id 0 in PanopticFPN) have no instance or semantic predictions.
+                continue
+            yield (self._seg == sid).numpy().astype(np.bool), sinfo
+    def instance_masks(self):
+        for sid in self._seg_ids:
+            sinfo = self._sinfo.get(sid)
+            if sinfo is None or not sinfo["isthing"]:
+                continue
+            mask = (self._seg == sid).numpy().astype(np.bool)
+            if mask.sum() > 0:
+                yield mask, sinfo
+def _create_text_labels(classes, scores, class_names, is_crowd=None):
+    """
+    Args:
+        classes (list[int] or None):
+        scores (list[float] or None):
+        class_names (list[str] or None):
+        is_crowd (list[bool] or None):
+    Returns:
+        list[str] or None
+    """
+    labels = None
+    if classes is not None:
+        if class_names is not None and len(class_names) > 0:
+            labels = [class_names[i] for i in classes]
+        else:
+            labels = [str(i) for i in classes]
+    if scores is not None:
+        if labels is None:
+            labels = ["{:.0f}%".format(s * 100) for s in scores]
+        else:
+            labels = ["{} {:.0f}%".format(l, s * 100) for l, s in zip(labels, scores)]
+    if labels is not None and is_crowd is not None:
+        labels = [l + ("|crowd" if crowd else "") for l, crowd in zip(labels, is_crowd)]
+    return labels
+class VisImage:
+    def __init__(self, img, scale=1.0):
+        """
+        Args:
+            img (ndarray): an RGB image of shape (H, W, 3) in range [0, 255].
+            scale (float): scale the input image
+        """
+        self.img = img
+        self.scale = scale
+        self.width, self.height = img.shape[1], img.shape[0]
+        self._setup_figure(img)
+    def _setup_figure(self, img):
+        """
+        Args:
+            Same as in :meth:`__init__()`.
+        Returns:
+            fig (matplotlib.pyplot.figure): top level container for all the image plot elements.
+            ax (matplotlib.pyplot.Axes): contains figure elements and sets the coordinate system.
+        """
+        fig = mplfigure.Figure(frameon=False)
+        self.dpi = fig.get_dpi()
+        # add a small 1e-2 to avoid precision lost due to matplotlib's truncation
+        # (https://github.com/matplotlib/matplotlib/issues/15363)
+        fig.set_size_inches(
+            (self.width * self.scale + 1e-2) / self.dpi,
+            (self.height * self.scale + 1e-2) / self.dpi,
+        )
+        self.canvas = FigureCanvasAgg(fig)
+        # self.canvas = mpl.backends.backend_cairo.FigureCanvasCairo(fig)
+        ax = fig.add_axes([0.0, 0.0, 1.0, 1.0])
+        ax.axis("off")
+        self.fig = fig
+        self.ax = ax
+        self.reset_image(img)
+    def reset_image(self, img):
+        """
+        Args:
+            img: same as in __init__
+        """
+        img = img.astype("uint8")
+        self.ax.imshow(img, extent=(0, self.width, self.height, 0), interpolation="nearest")
+    def save(self, filepath):
+        """
+        Args:
+            filepath (str): a string that contains the absolute path, including the file name, where
+                the visualized image will be saved.
+        """
+        self.fig.savefig(filepath)
+    def get_image(self):
+        """
+        Returns:
+            ndarray:
+                the visualized image of shape (H, W, 3) (RGB) in uint8 type.
+                The shape is scaled w.r.t the input image using the given `scale` argument.
+        """
+        canvas = self.canvas
+        s, (width, height) = canvas.print_to_buffer()
+        # buf = io.BytesIO()  # works for cairo backend
+        # canvas.print_rgba(buf)
+        # width, height = self.width, self.height
+        # s = buf.getvalue()
+        buffer = np.frombuffer(s, dtype="uint8")
+        img_rgba = buffer.reshape(height, width, 4)
+        rgb, alpha = np.split(img_rgba, [3], axis=2)
+        return rgb.astype("uint8")
+class Visualizer:
+    """
+    Visualizer that draws data about detection/segmentation on images.
+    It contains methods like `draw_{text,box,circle,line,binary_mask,polygon}`
+    that draw primitive objects to images, as well as high-level wrappers like
+    `draw_{instance_predictions,sem_seg,panoptic_seg_predictions,dataset_dict}`
+    that draw composite data in some pre-defined style.
+    Note that the exact visualization style for the high-level wrappers are subject to change.
+    Style such as color, opacity, label contents, visibility of labels, or even the visibility
+    of objects themselves (e.g. when the object is too small) may change according
+    to different heuristics, as long as the results still look visually reasonable.
+    To obtain a consistent style, you can implement custom drawing functions with the
+    abovementioned primitive methods instead. If you need more customized visualization
+    styles, you can process the data yourself following their format documented in
+    tutorials (:doc:`/tutorials/models`, :doc:`/tutorials/datasets`). This class does not
+    intend to satisfy everyone's preference on drawing styles.
+    This visualizer focuses on high rendering quality rather than performance. It is not
+    designed to be used for real-time applications.
+    """
+    # TODO implement a fast, rasterized version using OpenCV
+    def __init__(self, img_rgb, is_img=True, metadata=None, scale=1.0, instance_mode=ColorMode.IMAGE):
+        """
+        Args:
+            img_rgb: a numpy array of shape (H, W, C), where H and W correspond to
+                the height and width of the image respectively. C is the number of
+                color channels. The image is required to be in RGB format since that
+                is a requirement of the Matplotlib library. The image is also expected
+                to be in the range [0, 255].
+            metadata (Metadata): dataset metadata (e.g. class names and colors)
+            instance_mode (ColorMode): defines one of the pre-defined style for drawing
+                instances on an image.
+        """
+        if is_img:
+            self.img = np.asarray(img_rgb).clip(0, 255).astype(np.uint8)
+        else:
+            self.img = np.zeros_like(img_rgb).clip(0, 255).astype(np.uint8)
+        if metadata is None:
+            metadata = MetadataCatalog.get("__nonexist__")
+        self.metadata = metadata
+        self.output = VisImage(self.img, scale=scale)
+        self.cpu_device = torch.device("cpu")
+        # too small texts are useless, therefore clamp to 9
+        self._default_font_size = max(
+            np.sqrt(self.output.height * self.output.width) // 90, 10 // scale
+        )
+        self._instance_mode = instance_mode
+        self.keypoint_threshold = _KEYPOINT_THRESHOLD
+    def get_image(self, img):
+        img = np.asarray(img).clip(0, 255).astype(np.uint8)
+        return VisImage(img, scale=1.0)
+    def draw_box_predictions(
+        self,
+        boxes=None,
+        labels=None,
+        scores=None,
+        assigned_colors=None
+    ):
+        """
+        Args:
+            boxes (Boxes, RotatedBoxes or ndarray): either a :class:`Boxes`,
+                or an Nx4 numpy array of XYXY_ABS format for the N objects in a single image,
+                or a :class:`RotatedBoxes`,
+                or an Nx5 numpy array of (x_center, y_center, width, height, angle_degrees) format
+                for the N objects in a single image,
+            labels (list[str]): the text to be displayed for each instance.
+            assigned_colors (list[matplotlib.colors]): a list of colors, where each color
+                corresponds to each mask or box in the image. Refer to 'matplotlib.colors'
+                for full list of formats that the colors are accepted in.
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        num_instances = 0
+        boxes = self._convert_boxes(boxes)
+        classes = labels.tolist()
+        scores = scores.tolist()
+        labels = _create_text_labels(classes, scores, self.metadata.get("stuff_classes", None))
+        num_instances = len(boxes)
+        assert len(labels) == num_instances
+        if assigned_colors is None:
+            # assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)]
+            assigned_colors = [instance_color(rgb=True, idx=i, maximum=1) for i in range(num_instances)]
+        if num_instances == 0:
+            return self.output
+        # Display in largest to smallest order to reduce occlusion.
+        areas = None
+        areas = np.prod(boxes[:, 2:] - boxes[:, :2], axis=1)
+        if areas is not None:
+            sorted_idxs = np.argsort(-areas).tolist()
+            # Re-order overlapped instances in descending order.
+            boxes = boxes[sorted_idxs] if boxes is not None else None
+            labels = [labels[k] for k in sorted_idxs] if labels is not None else None
+            assigned_colors = [assigned_colors[idx] for idx in sorted_idxs]
+        for i in range(num_instances):
+            color = assigned_colors[i]
+            if boxes is not None:
+                self.draw_box(boxes[i], edge_color=color)
+            if labels is not None:
+                # first get a box
+                if boxes is not None:
+                    x0, y0, x1, y1 = boxes[i]
+                    text_pos = (x0, y0)  # if drawing boxes, put text on the box corner.
+                    horiz_align = "left"
+                else:
+                    continue  # drawing the box confidence for keypoints isn't very useful.
+                # for small objects, draw text at the side to avoid occlusion
+                instance_area = (y1 - y0) * (x1 - x0)
+                if (
+                    instance_area < _SMALL_OBJECT_AREA_THRESH * self.output.scale
+                    or y1 - y0 < 40 * self.output.scale
+                ):
+                    if y1 >= self.output.height - 5:
+                        text_pos = (x1, y0)
+                    else:
+                        text_pos = (x0, y1)
+                height_ratio = (y1 - y0) / np.sqrt(self.output.height * self.output.width)
+                lighter_color = self._change_color_brightness(color, brightness_factor=0.7)
+                font_size = (
+                    np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2)
+                    * 0.5
+                    * self._default_font_size
+                )
+                self.draw_text(
+                    labels[i],
+                    text_pos,
+                    color=lighter_color,
+                    horizontal_alignment=horiz_align,
+                    font_size=font_size,
+                )
+        return self.output
+    def draw_instance_predictions(self, predictions, alpha=0.8, is_text=True):
+        """
+        Draw instance-level prediction results on an image.
+        Args:
+            predictions (Instances): the output of an instance detection/segmentation
+                model. Following fields will be used to draw:
+                "pred_boxes", "pred_classes", "scores", "pred_masks" (or "pred_masks_rle").
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        boxes = predictions.pred_boxes if predictions.has("pred_boxes") else None
+        scores = predictions.scores if predictions.has("scores") else None
+        classes = predictions.pred_classes.tolist() if predictions.has("pred_classes") else None
+        labels = _create_text_labels(classes, scores, self.metadata.get("stuff_classes", None))
+        keypoints = predictions.pred_keypoints if predictions.has("pred_keypoints") else None
+        if predictions.has("pred_masks"):
+            masks = np.asarray(predictions.pred_masks)
+            masks = [GenericMask(x, self.output.height, self.output.width) for x in masks]
+        else:
+            masks = None
+        if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("stuff_colors"):
+            # colors = [
+            #     self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) for c in classes
+            # ]
+            colors = [
+                instance_color(rgb=True, idx=c, maximum=1) for c in classes
+            ]
+        else:
+            colors = None
+        if self._instance_mode == ColorMode.IMAGE_BW:
+            self.output.reset_image(
+                self._create_grayscale_image(
+                    (predictions.pred_masks.any(dim=0) > 0).numpy()
+                    if predictions.has("pred_masks")
+                    else None
+                )
+            )
+        self.overlay_instances(
+            masks=masks,
+            boxes=boxes,
+            labels=labels,
+            keypoints=keypoints,
+            assigned_colors=colors,
+            alpha=alpha,
+            is_text=is_text,
+        )
+        return self.output
+    def draw_sem_seg(self, sem_seg, area_threshold=None, alpha=0.8, is_text=True):
+        """
+        Draw semantic segmentation predictions/labels.
+        Args:
+            sem_seg (Tensor or ndarray): the segmentation of shape (H, W).
+                Each value is the integer label of the pixel.
+            area_threshold (int): segments with less than `area_threshold` are not drawn.
+            alpha (float): the larger it is, the more opaque the segmentations are.
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        if isinstance(sem_seg, torch.Tensor):
+            sem_seg = sem_seg.numpy()
+        labels, areas = np.unique(sem_seg, return_counts=True)
+        sorted_idxs = np.argsort(-areas).tolist()
+        labels = labels[sorted_idxs]
+        for label in filter(lambda l: l < len(self.metadata.stuff_classes), labels):
+            try:
+                mask_color = [x / 255 for x in self.metadata.stuff_colors[label]]
+            except (AttributeError, IndexError):
+                mask_color = None
+            binary_mask = (sem_seg == label).astype(np.uint8)
+            text = self.metadata.stuff_classes[label]
+            self.draw_binary_mask(
+                binary_mask,
+                color=mask_color,
+                edge_color=_OFF_WHITE,
+                text=text,
+                alpha=alpha,
+                area_threshold=area_threshold,
+                is_text=is_text,
+            )
+        return self.output
+    def draw_panoptic_seg(self, panoptic_seg, segments_info, area_threshold=None, alpha=0.7, is_text=True,):
+        """
+        Draw panoptic prediction annotations or results.
+        Args:
+            panoptic_seg (Tensor): of shape (height, width) where the values are ids for each
+                segment.
+            segments_info (list[dict] or None): Describe each segment in `panoptic_seg`.
+                If it is a ``list[dict]``, each dict contains keys "id", "category_id".
+                If None, category id of each pixel is computed by
+                ``pixel // metadata.label_divisor``.
+            area_threshold (int): stuff segments with less than `area_threshold` are not drawn.
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        pred = _PanopticPrediction(panoptic_seg, segments_info, self.metadata)
+        if self._instance_mode == ColorMode.IMAGE_BW:
+            self.output.reset_image(self._create_grayscale_image(pred.non_empty_mask()))
+        # draw mask for all semantic segments first i.e. "stuff"
+        for mask, sinfo in pred.semantic_masks():
+            category_idx = sinfo["category_id"]
+            try:
+                mask_color = [x / 255 for x in self.metadata.stuff_colors[category_idx]]
+            except AttributeError:
+                mask_color = None
+            text = self.metadata.stuff_classes[category_idx]
+            self.draw_binary_mask(
+                mask,
+                color=mask_color,
+                edge_color=_OFF_WHITE,
+                text=text,
+                alpha=alpha,
+                area_threshold=area_threshold,
+                is_text=is_text,
+            )
+        # draw mask for all instances second
+        all_instances = list(pred.instance_masks())
+        if len(all_instances) == 0:
+            return self.output
+        masks, sinfo = list(zip(*all_instances))
+        category_ids = [x["category_id"] for x in sinfo]
+        try:
+            scores = [x["score"] for x in sinfo]
+        except KeyError:
+            scores = None
+        labels = _create_text_labels(
+            category_ids, scores, self.metadata.stuff_classes, [x.get("iscrowd", 0) for x in sinfo]
+        )
+        try:
+            colors = [
+                self._jitter([x / 255 for x in self.metadata.stuff_colors[c]]) for c in category_ids
+            ]
+        except AttributeError:
+            colors = None
+        self.overlay_instances(masks=masks, labels=labels, assigned_colors=colors, alpha=alpha, is_text=is_text)
+        return self.output
+    draw_panoptic_seg_predictions = draw_panoptic_seg  # backward compatibility
+    def draw_dataset_dict(self, dic):
+        """
+        Draw annotations/segmentaions in Detectron2 Dataset format.
+        Args:
+            dic (dict): annotation/segmentation data of one image, in Detectron2 Dataset format.
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        annos = dic.get("annotations", None)
+        if annos:
+            if "segmentation" in annos[0]:
+                masks = [x["segmentation"] for x in annos]
+            else:
+                masks = None
+            if "keypoints" in annos[0]:
+                keypts = [x["keypoints"] for x in annos]
+                keypts = np.array(keypts).reshape(len(annos), -1, 3)
+            else:
+                keypts = None
+            boxes = [
+                BoxMode.convert(x["bbox"], x["bbox_mode"], BoxMode.XYXY_ABS)
+                if len(x["bbox"]) == 4
+                else x["bbox"]
+                for x in annos
+            ]
+            colors = None
+            category_ids = [x["category_id"] for x in annos]
+            if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("stuff_colors"):
+                colors = [
+                    self._jitter([x / 255 for x in self.metadata.stuff_colors[c]])
+                    for c in category_ids
+                ]
+            names = self.metadata.get("stuff_classes", None)
+            labels = _create_text_labels(
+                category_ids,
+                scores=None,
+                class_names=names,
+                is_crowd=[x.get("iscrowd", 0) for x in annos],
+            )
+            self.overlay_instances(
+                labels=labels, boxes=boxes, masks=masks, keypoints=keypts, assigned_colors=colors
+            )
+        sem_seg = dic.get("sem_seg", None)
+        if sem_seg is None and "sem_seg_file_name" in dic:
+            with PathManager.open(dic["sem_seg_file_name"], "rb") as f:
+                sem_seg = Image.open(f)
+                sem_seg = np.asarray(sem_seg, dtype="uint8")
+        if sem_seg is not None:
+            self.draw_sem_seg(sem_seg, area_threshold=0, alpha=0.5)
+        pan_seg = dic.get("pan_seg", None)
+        if pan_seg is None and "pan_seg_file_name" in dic:
+            with PathManager.open(dic["pan_seg_file_name"], "rb") as f:
+                pan_seg = Image.open(f)
+                pan_seg = np.asarray(pan_seg)
+                from panopticapi.utils import rgb2id
+                pan_seg = rgb2id(pan_seg)
+        if pan_seg is not None:
+            segments_info = dic["segments_info"]
+            pan_seg = torch.tensor(pan_seg)
+            self.draw_panoptic_seg(pan_seg, segments_info, area_threshold=0, alpha=0.5)
+        return self.output
+    def overlay_instances(
+        self,
+        *,
+        boxes=None,
+        labels=None,
+        masks=None,
+        keypoints=None,
+        assigned_colors=None,
+        alpha=0.5,
+        is_text=True,
+    ):
+        """
+        Draw boxes, mask polygons, label text and keypoints for a set of instances.
+        Args:
+            boxes (Boxes, RotatedBoxes or ndarray): either a :class:`Boxes`,
+                or an Nx4 numpy array of XYXY_ABS format for the N objects in a single image,
+                or a :class:`RotatedBoxes`,
+                or an Nx5 numpy array of (x_center, y_center, width, height, angle_degrees) format
+                for the N objects in a single image,
+            labels (list[str]): the text to be displayed for each instance.
+            masks (masks-like object): Supported types are:
+                * :class:`detectron2.structures.PolygonMasks`,
+                  :class:`detectron2.structures.BitMasks`.
+                * list[list[ndarray]]: contains the segmentation masks for all objects in one image.
+                  The first level of the list corresponds to individual instances. The second
+                  level to all the polygon that compose the instance, and the third level
+                  to the polygon coordinates. The third level should have the format of
+                  [x0, y0, x1, y1, ..., xn, yn] (n >= 3).
+                * list[ndarray]: each ndarray is a binary mask of shape (H, W).
+                * list[dict]: each dict is a COCO-style RLE.
+            keypoints (Keypoint or array like): an array-like object of shape (N, K, 3),
+                where the N is the number of instances and K is the number of keypoints.
+                The last dimension corresponds to (x, y, visibility or score).
+            assigned_colors (list[matplotlib.colors]): a list of colors, where each color
+                corresponds to each mask or box in the image. Refer to 'matplotlib.colors'
+                for full list of formats that the colors are accepted in.
+            alpha (float): passed to `draw_polygon` when filling mask polygons.
+            is_text (bool): when False, label text is not drawn.
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        # The instance count comes from the first of boxes/masks/keypoints that is
+        # given; the others must agree with it.
+        num_instances = 0
+        if boxes is not None:
+            boxes = self._convert_boxes(boxes)
+            num_instances = len(boxes)
+        if masks is not None:
+            masks = self._convert_masks(masks)
+            if num_instances:
+                assert len(masks) == num_instances
+            else:
+                num_instances = len(masks)
+        if keypoints is not None:
+            if num_instances:
+                assert len(keypoints) == num_instances
+            else:
+                num_instances = len(keypoints)
+            keypoints = self._convert_keypoints(keypoints)
+        if labels is not None:
+            assert len(labels) == num_instances
+        if assigned_colors is None:
+            # assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)]
+            # Default colors are derived from the instance index.
+            assigned_colors = [instance_color(rgb=True, idx=i, maximum=1) for i in range(num_instances)]
+        if num_instances == 0:
+            return self.output
+        # Five-column boxes are rotated boxes; that path receives only boxes, labels
+        # and colors, so masks and keypoints are not drawn for them.
+        if boxes is not None and boxes.shape[1] == 5:
+            return self.overlay_rotated_instances(
+                boxes=boxes, labels=labels, assigned_colors=assigned_colors
+            )
+        # Display in largest to smallest order to reduce occlusion.
+        areas = None
+        if boxes is not None:
+            areas = np.prod(boxes[:, 2:] - boxes[:, :2], axis=1)
+        elif masks is not None:
+            areas = np.asarray([x.area() for x in masks])
+        if areas is not None:
+            sorted_idxs = np.argsort(-areas).tolist()
+            # Re-order overlapped instances in descending order.
+            boxes = boxes[sorted_idxs] if boxes is not None else None
+            labels = [labels[k] for k in sorted_idxs] if labels is not None else None
+            masks = [masks[idx] for idx in sorted_idxs] if masks is not None else None
+            assigned_colors = [assigned_colors[idx] for idx in sorted_idxs]
+            keypoints = keypoints[sorted_idxs] if keypoints is not None else None
+        for i in range(num_instances):
+            color = assigned_colors[i]
+            if boxes is not None:
+                self.draw_box(boxes[i], edge_color=color)
+            if masks is not None:
+                for segment in masks[i].polygons:
+                    self.draw_polygon(segment.reshape(-1, 2), color, alpha=alpha)
+            if labels is not None:
+                # first get a box
+                if boxes is not None:
+                    x0, y0, x1, y1 = boxes[i]
+                    text_pos = (x0, y0)  # if drawing boxes, put text on the box corner.
+                    horiz_align = "left"
+                elif masks is not None:
+                    # skip small mask without polygon
+                    if len(masks[i].polygons) == 0:
+                        continue
+                    x0, y0, x1, y1 = masks[i].bbox()
+                    # draw text in the center (defined by median) when box is not drawn
+                    # median is less sensitive to outliers.
+                    text_pos = np.median(masks[i].mask.nonzero(), axis=1)[::-1]
+                    horiz_align = "center"
+                else:
+                    continue  # drawing the box confidence for keypoints isn't very useful.
+                # for small objects, draw text at the side to avoid occlusion
+                instance_area = (y1 - y0) * (x1 - x0)
+                if (
+                    instance_area < _SMALL_OBJECT_AREA_THRESH * self.output.scale
+                    or y1 - y0 < 40 * self.output.scale
+                ):
+                    if y1 >= self.output.height - 5:
+                        text_pos = (x1, y0)
+                    else:
+                        text_pos = (x0, y1)
+                # Font size grows with the instance height relative to the image,
+                # clipped to 1.2x-2x of half the default font size.
+                height_ratio = (y1 - y0) / np.sqrt(self.output.height * self.output.width)
+                lighter_color = self._change_color_brightness(color, brightness_factor=0.7)
+                font_size = (
+                    np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2)
+                    * 0.5
+                    * self._default_font_size
+                )
+                if is_text:
+                    self.draw_text(
+                        labels[i],
+                        text_pos,
+                        color=lighter_color,
+                        horizontal_alignment=horiz_align,
+                        font_size=font_size,
+                    )
+        # draw keypoints
+        if keypoints is not None:
+            for keypoints_per_instance in keypoints:
+                self.draw_and_connect_keypoints(keypoints_per_instance)
+        return self.output
+    def overlay_rotated_instances(self, boxes=None, labels=None, assigned_colors=None):
+        """
+        Args:
+            boxes (ndarray): an Nx5 numpy array of
+                (x_center, y_center, width, height, angle_degrees) format
+                for the N objects in a single image.
+            labels (list[str]): the text to be displayed for each instance.
+            assigned_colors (list[matplotlib.colors]): a list of colors, where each color
+                corresponds to each mask or box in the image. Refer to 'matplotlib.colors'
+                for full list of formats that the colors are accepted in.
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        num_instances = len(boxes)
+        if assigned_colors is None:
+            # assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)]
+            assigned_colors = [instance_color(rgb=True, idx=i, maximum=1) for i in range(num_instances)]
+        if num_instances == 0:
+            return self.output
+        # Display in largest to smallest order to reduce occlusion.
+        if boxes is not None:
+            areas = boxes[:, 2] * boxes[:, 3]
+        sorted_idxs = np.argsort(-areas).tolist()
+        # Re-order overlapped instances in descending order.
+        boxes = boxes[sorted_idxs]
+        labels = [labels[k] for k in sorted_idxs] if labels is not None else None
+        colors = [assigned_colors[idx] for idx in sorted_idxs]
+        for i in range(num_instances):
+            self.draw_rotated_box_with_label(
+                boxes[i], edge_color=colors[i], label=labels[i] if labels is not None else None
+            )
+        return self.output
+    def draw_and_connect_keypoints(self, keypoints):
+        """
+        Draws keypoints of an instance and follows the rules for keypoint connections
+        to draw lines between appropriate keypoints. This follows color heuristics for
+        line color.
+        Args:
+            keypoints (Tensor): a tensor of shape (K, 3), where K is the number of keypoints
+                and the last dimension corresponds to (x, y, probability).
+        Returns:
+            output (VisImage): image object with visualizations.
+        """
+        visible = {}
+        keypoint_names = self.metadata.get("keypoint_names")
+        for idx, keypoint in enumerate(keypoints):
+            # draw keypoint
+            x, y, prob = keypoint
+            if prob > self.keypoint_threshold:
+                self.draw_circle((x, y), color=_RED)
+                if keypoint_names:
+                    keypoint_name = keypoint_names[idx]
+                    visible[keypoint_name] = (x, y)
+        if self.metadata.get("keypoint_connection_rules"):
+            for kp0, kp1, color in self.metadata.keypoint_connection_rules:
+                if kp0 in visible and kp1 in visible:
+                    x0, y0 = visible[kp0]
+                    x1, y1 = visible[kp1]
+                    color = tuple(x / 255.0 for x in color)
+                    self.draw_line([x0, x1], [y0, y1], color=color)
+        # draw lines from nose to mid-shoulder and mid-shoulder to mid-hip
+        # Note that this strategy is specific to person keypoints.
+        # For other keypoints, it should just do nothing
+        try:
+            ls_x, ls_y = visible["left_shoulder"]
+            rs_x, rs_y = visible["right_shoulder"]
+            mid_shoulder_x, mid_shoulder_y = (ls_x + rs_x) / 2, (ls_y + rs_y) / 2
+        except KeyError:
+            pass
+        else:
+            # draw line from nose to mid-shoulder
+            nose_x, nose_y = visible.get("nose", (None, None))
+            if nose_x is not None:
+                self.draw_line([nose_x, mid_shoulder_x], [nose_y, mid_shoulder_y], color=_RED)
+            try:
+                # draw line from mid-shoulder to mid-hip
+                lh_x, lh_y = visible["left_hip"]
+                rh_x, rh_y = visible["right_hip"]
+            except KeyError:
+                pass
+            else:
+                mid_hip_x, mid_hip_y = (lh_x + rh_x) / 2, (lh_y + rh_y) / 2
+                self.draw_line([mid_hip_x, mid_shoulder_x], [mid_hip_y, mid_shoulder_y], color=_RED)
+        return self.output
+    """
+    Primitive drawing functions:
+    """
+    def draw_text(
+        self,
+        text,
+        position,
+        *,
+        font_size=None,
+        color="g",
+        horizontal_alignment="center",
+        rotation=0,
+    ):
+        """
+        Args:
+            text (str): class label
+            position (tuple): a tuple of the x and y coordinates to place text on image.
+            font_size (int, optional): font of the text. If not provided, a font size
+                proportional to the image width is calculated and used.
+            color: color of the text. Refer to `matplotlib.colors` for full list
+                of formats that are accepted.
+            horizontal_alignment (str): see `matplotlib.text.Text`
+            rotation: rotation angle in degrees CCW
+        Returns:
+            output (VisImage): image object with text drawn.
+        """
+        if not font_size:
+            font_size = self._default_font_size
+        # since the text background is dark, we don't want the text to be dark
+        color = np.maximum(list(mplc.to_rgb(color)), 0.2)
+        color[np.argmax(color)] = max(0.8, np.max(color))
+        x, y = position
+        self.output.ax.text(
+            x,
+            y,
+            text,
+            size=font_size * self.output.scale,
+            family="sans-serif",
+            bbox={"facecolor": "black", "alpha": 0.8, "pad": 0.7, "edgecolor": "none"},
+            verticalalignment="top",
+            horizontalalignment=horizontal_alignment,
+            color=color,
+            zorder=10,
+            rotation=rotation,
+        )
+        return self.output
+    def draw_box(self, box_coord, alpha=1.0, edge_color="g", line_style="-"):
+        """
+        Args:
+            box_coord (tuple): a tuple containing x0, y0, x1, y1 coordinates, where x0 and y0
+                are the coordinates of the image's top left corner. x1 and y1 are the
+                coordinates of the image's bottom right corner.
+            alpha (float): blending efficient. Smaller values lead to more transparent masks.
+            edge_color: color of the outline of the box. Refer to `matplotlib.colors`
+                for full list of formats that are accepted.
+            line_style (string): the string to use to create the outline of the boxes.
+        Returns:
+            output (VisImage): image object with box drawn.
+        """
+        x0, y0, x1, y1 = box_coord
+        width = x1 - x0
+        height = y1 - y0
+        linewidth = 2
+        self.output.ax.add_patch(
+            mpl.patches.Rectangle(
+                (x0, y0),
+                width,
+                height,
+                fill=False,
+                edgecolor=edge_color,
+                linewidth=linewidth * self.output.scale,
+                alpha=alpha,
+                linestyle=line_style,
+            )
+        )
+        return self.output
+    def draw_rotated_box_with_label(
+        self, rotated_box, alpha=0.5, edge_color="g", line_style="-", label=None
+    ):
+        """
+        Draw a rotated box with label on its top-left corner.
+        Args:
+            rotated_box (tuple): a tuple containing (cnt_x, cnt_y, w, h, angle),
+                where cnt_x and cnt_y are the center coordinates of the box.
+                w and h are the width and height of the box. angle represents how
+                many degrees the box is rotated CCW with regard to the 0-degree box.
+            alpha (float): blending efficient. Smaller values lead to more transparent masks.
+            edge_color: color of the outline of the box. Refer to `matplotlib.colors`
+                for full list of formats that are accepted.
+            line_style (string): the string to use to create the outline of the boxes.
+            label (string): label for rotated box. It will not be rendered when set to None.
+        Returns:
+            output (VisImage): image object with box drawn.
+        """
+        cnt_x, cnt_y, w, h, angle = rotated_box
+        area = w * h
+        # use thinner lines when the box is small
+        linewidth = self._default_font_size / (
+            6 if area < _SMALL_OBJECT_AREA_THRESH * self.output.scale else 3
+        )
+        theta = angle * math.pi / 180.0
+        c = math.cos(theta)
+        s = math.sin(theta)
+        rect = [(-w / 2, h / 2), (-w / 2, -h / 2), (w / 2, -h / 2), (w / 2, h / 2)]
+        # x: left->right ; y: top->down
+        rotated_rect = [(s * yy + c * xx + cnt_x, c * yy - s * xx + cnt_y) for (xx, yy) in rect]
+        for k in range(4):
+            j = (k + 1) % 4
+            self.draw_line(
+                [rotated_rect[k][0], rotated_rect[j][0]],
+                [rotated_rect[k][1], rotated_rect[j][1]],
+                color=edge_color,
+                linestyle="--" if k == 1 else line_style,
+                linewidth=linewidth,
+            )
+        if label is not None:
+            text_pos = rotated_rect[1]  # topleft corner
+            height_ratio = h / np.sqrt(self.output.height * self.output.width)
+            label_color = self._change_color_brightness(edge_color, brightness_factor=0.7)
+            font_size = (
+                np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2) * 0.5 * self._default_font_size
+            )
+            self.draw_text(label, text_pos, color=label_color, font_size=font_size, rotation=angle)
+        return self.output
+    def draw_circle(self, circle_coord, color, radius=3):
+        """
+        Args:
+            circle_coord (list(int) or tuple(int)): contains the x and y coordinates
+                of the center of the circle.
+            color: color of the polygon. Refer to `matplotlib.colors` for a full list of
+                formats that are accepted.
+            radius (int): radius of the circle.
+        Returns:
+            output (VisImage): image object with box drawn.
+        """
+        x, y = circle_coord
+        self.output.ax.add_patch(
+            mpl.patches.Circle(circle_coord, radius=radius, fill=True, color=color)
+        )
+        return self.output
+    def draw_line(self, x_data, y_data, color, linestyle="-", linewidth=None):
+        """
+        Args:
+            x_data (list[int]): a list containing x values of all the points being drawn.
+                Length of list should match the length of y_data.
+            y_data (list[int]): a list containing y values of all the points being drawn.
+                Length of list should match the length of x_data.
+            color: color of the line. Refer to `matplotlib.colors` for a full list of
+                formats that are accepted.
+            linestyle: style of the line. Refer to `matplotlib.lines.Line2D`
+                for a full list of formats that are accepted.
+            linewidth (float or None): width of the line. When it's None,
+                a default value will be computed and used.
+        Returns:
+            output (VisImage): image object with line drawn.
+        """
+        if linewidth is None:
+            linewidth = self._default_font_size / 3
+        linewidth = max(linewidth, 1)
+        self.output.ax.add_line(
+            mpl.lines.Line2D(
+                x_data,
+                y_data,
+                linewidth=linewidth * self.output.scale,
+                color=color,
+                linestyle=linestyle,
+            )
+        )
+        return self.output
+    def draw_binary_mask(
+        self, binary_mask, color=None, *, edge_color=None, text=None, alpha=0.5, area_threshold=10, is_text=True,
+    ):
+        """
+        Args:
+            binary_mask (ndarray): numpy array of shape (H, W), where H is the image height and
+                W is the image width. Each value in the array is either a 0 or 1 value of uint8
+                type.
+            color: color of the mask. Refer to `matplotlib.colors` for a full list of
+                formats that are accepted. If None, will pick a random color.
+            edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a
+                full list of formats that are accepted.
+            text (str): if None, will be drawn on the object
+            alpha (float): blending efficient. Smaller values lead to more transparent masks.
+            area_threshold (float): a connected component smaller than this area will not be shown.
+        Returns:
+            output (VisImage): image object with mask drawn.
+        """
+        if color is None:
+            color = random_color(rgb=True, maximum=1)
+        color = mplc.to_rgb(color)
+        has_valid_segment = False
+        binary_mask = binary_mask.astype("uint8")  # opencv needs uint8
+        mask = GenericMask(binary_mask, self.output.height, self.output.width)
+        shape2d = (binary_mask.shape[0], binary_mask.shape[1])
+        if not mask.has_holes:
+            # draw polygons for regular masks
+            for segment in mask.polygons:
+                area = mask_util.area(mask_util.frPyObjects([segment], shape2d[0], shape2d[1]))
+                if area < (area_threshold or 0):
+                    continue
+                has_valid_segment = True
+                segment = segment.reshape(-1, 2)
+                self.draw_polygon(segment, color=color, edge_color=edge_color, alpha=alpha)
+        else:
+            # TODO: Use Path/PathPatch to draw vector graphics:
+            # https://stackoverflow.com/questions/8919719/how-to-plot-a-complex-polygon
+            rgba = np.zeros(shape2d + (4,), dtype="float32")
+            rgba[:, :, :3] = color
+            rgba[:, :, 3] = (mask.mask == 1).astype("float32") * alpha
+            has_valid_segment = True
+            self.output.ax.imshow(rgba, extent=(0, self.output.width, self.output.height, 0))
+        if is_text:
+            if text is not None and has_valid_segment:
+                lighter_color = self._change_color_brightness(color, brightness_factor=0.7)
+                self._draw_text_in_mask(binary_mask, text, lighter_color)
+        return self.output
+    def draw_soft_mask(self, soft_mask, color=None, *, text=None, alpha=0.5):
+        """
+        Args:
+            soft_mask (ndarray): float array of shape (H, W), each value in [0, 1].
+            color: color of the mask. Refer to `matplotlib.colors` for a full list of
+                formats that are accepted. If None, will pick a random color.
+            text (str): if None, will be drawn on the object
+            alpha (float): blending efficient. Smaller values lead to more transparent masks.
+        Returns:
+            output (VisImage): image object with mask drawn.
+        """
+        if color is None:
+            color = random_color(rgb=True, maximum=1)
+        color = mplc.to_rgb(color)
+        shape2d = (soft_mask.shape[0], soft_mask.shape[1])
+        rgba = np.zeros(shape2d + (4,), dtype="float32")
+        rgba[:, :, :3] = color
+        rgba[:, :, 3] = soft_mask * alpha
+        self.output.ax.imshow(rgba, extent=(0, self.output.width, self.output.height, 0))
+        if text is not None:
+            lighter_color = self._change_color_brightness(color, brightness_factor=0.7)
+            binary_mask = (soft_mask > 0.5).astype("uint8")
+            # self._draw_text_in_mask(binary_mask, text, lighter_color)
+        return self.output
+    def draw_polygon(self, segment, color, edge_color=None, alpha=0.5):
+        """
+        Args:
+            segment: numpy array of shape Nx2, containing all the points in the polygon.
+            color: color of the polygon. Refer to `matplotlib.colors` for a full list of
+                formats that are accepted.
+            edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a
+                full list of formats that are accepted. If not provided, a darker shade
+                of the polygon color will be used instead.
+            alpha (float): blending efficient. Smaller values lead to more transparent masks.
+        Returns:
+            output (VisImage): image object with polygon drawn.
+        """
+        if edge_color is None:
+            # make edge color darker than the polygon color
+            if alpha > 0.8:
+                edge_color = self._change_color_brightness(color, brightness_factor=-0.7)
+            else:
+                edge_color = color
+        edge_color = mplc.to_rgb(edge_color) + (1,)
+        polygon = mpl.patches.Polygon(
+            segment,
+            fill=True,
+            facecolor=mplc.to_rgb(color) + (alpha,),
+            edgecolor=edge_color,
+            linewidth=max(self._default_font_size // 15 * self.output.scale, 1),
+        )
+        self.output.ax.add_patch(polygon)
+        return self.output
+    """
+    Internal methods:
+    """
+    def _jitter(self, color):
+        """
+        Randomly modifies given color to produce a slightly different color than the color given.
+        Args:
+            color (tuple[double]): a tuple of 3 elements, containing the RGB values of the color
+                picked. The values in the list are in the [0.0, 1.0] range.
+        Returns:
+            jittered_color (tuple[double]): a tuple of 3 elements, containing the RGB values of the
+                color after being jittered. The values in the list are in the [0.0, 1.0] range.
+        """
+        color = mplc.to_rgb(color)
+        vec = np.random.rand(3)
+        # better to do it in another color space
+        vec = vec / np.linalg.norm(vec) * 0.5
+        res = np.clip(vec + color, 0, 1)
+        return tuple(res)
+    def _create_grayscale_image(self, mask=None):
+        """
+        Create a grayscale version of the original image.
+        The colors in masked area, if given, will be kept.
+        """
+        img_bw = self.img.astype("f4").mean(axis=2)
+        img_bw = np.stack([img_bw] * 3, axis=2)
+        if mask is not None:
+            img_bw[mask] = self.img[mask]
+        return img_bw
+    def _change_color_brightness(self, color, brightness_factor):
+        """
+        Depending on the brightness_factor, gives a lighter or darker color i.e. a color with
+        less or more saturation than the original color.
+        Args:
+            color: color of the polygon. Refer to `matplotlib.colors` for a full list of
+                formats that are accepted.
+            brightness_factor (float): a value in [-1.0, 1.0] range. A lightness factor of
+                0 will correspond to no change, a factor in [-1.0, 0) range will result in
+                a darker color and a factor in (0, 1.0] range will result in a lighter color.
+        Returns:
+            modified_color (tuple[double]): a tuple containing the RGB values of the
+                modified color. Each value in the tuple is in the [0.0, 1.0] range.
+        """
+        assert brightness_factor >= -1.0 and brightness_factor <= 1.0
+        color = mplc.to_rgb(color)
+        polygon_color = colorsys.rgb_to_hls(*mplc.to_rgb(color))
+        modified_lightness = polygon_color[1] + (brightness_factor * polygon_color[1])
+        modified_lightness = 0.0 if modified_lightness < 0.0 else modified_lightness
+        modified_lightness = 1.0 if modified_lightness > 1.0 else modified_lightness
+        modified_color = colorsys.hls_to_rgb(polygon_color[0], modified_lightness, polygon_color[2])
+        return modified_color
+    def _convert_boxes(self, boxes):
+        """
+        Convert different format of boxes to an NxB array, where B = 4 or 5 is the box dimension.
+        """
+        if isinstance(boxes, Boxes) or isinstance(boxes, RotatedBoxes):
+            return boxes.tensor.detach().numpy()
+        else:
+            return np.asarray(boxes)
+    def _convert_masks(self, masks_or_polygons):
+        """
+        Convert different format of masks or polygons to a tuple of masks and polygons.
+        Returns:
+            list[GenericMask]:
+        """
+        m = masks_or_polygons
+        if isinstance(m, PolygonMasks):
+            m = m.polygons
+        if isinstance(m, BitMasks):
+            m = m.tensor.numpy()
+        if isinstance(m, torch.Tensor):
+            m = m.numpy()
+        ret = []
+        for x in m:
+            if isinstance(x, GenericMask):
+                ret.append(x)
+            else:
+                ret.append(GenericMask(x, self.output.height, self.output.width))
+        return ret
+    def _draw_text_in_mask(self, binary_mask, text, color):
+        """
+        Find proper places to draw text given a binary mask.
+        """
+        # TODO sometimes drawn on wrong objects. the heuristics here can improve.
+        _num_cc, cc_labels, stats, centroids = cv2.connectedComponentsWithStats(binary_mask, 8)
+        if stats[1:, -1].size == 0:
+            return
+        largest_component_id = np.argmax(stats[1:, -1]) + 1
+        # draw text on the largest component, as well as other very large components.
+        for cid in range(1, _num_cc):
+            if cid == largest_component_id or stats[cid, -1] > _LARGE_MASK_AREA_THRESH:
+                # median is more stable than centroid
+                # center = centroids[largest_component_id]
+                center = np.median((cc_labels == cid).nonzero(), axis=1)[::-1]
+                self.draw_text(text, center, color=color)
+    def _convert_keypoints(self, keypoints):
+        if isinstance(keypoints, Keypoints):
+            keypoints = keypoints.tensor
+        keypoints = np.asarray(keypoints)
+        return keypoints
+    def get_output(self):
+        """
+        Returns:
+            output (VisImage): the image output containing the visualizations added
+            to the image.
+        """
+        return self.output

gradio_test.py ADDED Viewed

	@@ -0,0 +1,757 @@

+import torch
+import numpy as np
+from PIL import Image
+import cv2
+import os
+import sys
+import time
+import logging
+from pathlib import Path
+from typing import Tuple, Dict, List, Optional, Union
+import gradio as gr
+from huggingface_hub import hf_hub_download
+import warnings
+warnings.filterwarnings("ignore")
+from detectron2.config import get_cfg
+from detectron2.projects.deeplab import add_deeplab_config
+from detectron2.data import MetadataCatalog
+from detectron2.engine.defaults import DefaultPredictor
+from detectron2 import model_zoo
+from detectron2.utils.visualizer import Visualizer, ColorMode
+try:
+    from oneformer import (
+        add_oneformer_config,
+        add_common_config,
+        add_swin_config,
+        add_dinat_config,
+    )
+    from demo.defaults import DefaultPredictor as OneFormerPredictor
+    ONEFORMER_AVAILABLE = True
+except ImportError as e:
+    print(f"OneFormer not available: {e}")
+    ONEFORMER_AVAILABLE = False
+# Configure root logging at import time and create this module's logger.
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# Run the models on the GPU when CUDA is available, otherwise on the CPU.
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+CPU_DEVICE = torch.device("cpu")
+# Limit PyTorch to 4 CPU threads.
+torch.set_num_threads(4)
+# Segmentation class ids that count as floor surfaces, grouped by label.
+# NOTE(review): presumably ADE20K class indices - confirm against the dataset metadata.
+# ImprovedBlackspotDetector.floor_classes hard-codes the same ids; keep the two in sync.
+FLOOR_CLASSES = {
+    'floor': [3, 4, 13],
+    'carpet': [28],
+    'mat': [78],
+}
+# OneFormer settings per dataset: config file, Hugging Face repo and checkpoint file for
+# the Swin backbone, plus the resize targets passed to resize_image_for_processing.
+ONEFORMER_CONFIG = {
+    "ADE20K": {
+        "key": "ade20k",
+        "swin_cfg": "configs/ade20k/oneformer_swin_large_IN21k_384_bs16_160k.yaml",
+        "swin_model": "shi-labs/oneformer_ade20k_swin_large",
+        "swin_file": "250_16_swin_l_oneformer_ade20k_160k.pth",
+        "process_size": 640,
+        "max_size": 2560
+    }
+}
+# Hugging Face repo and checkpoint file of the Mask R-CNN blackspot model.
+BLACKSPOT_MODEL_REPO = "sww35/neuronest-blackspot"
+BLACKSPOT_MODEL_FILE = "model_0004999.pth"
+# Upper bounds (in pixels) for images returned for display; see prepare_display_image.
+DISPLAY_MAX_WIDTH = 1920
+DISPLAY_MAX_HEIGHT = 1080
+from universal_contrast_analyzer import UniversalContrastAnalyzer
+def resize_image_for_processing(image: np.ndarray, target_size: int = 640, max_size: int = 2560) -> Tuple[np.ndarray, float]:
+    h, w = image.shape[:2]
+    scale = target_size / min(h, w)
+    if scale * max(h, w) > max_size:
+        scale = max_size / max(h, w)
+    new_w = int(w * scale)
+    new_h = int(h * scale)
+    new_w = (new_w // 32) * 32
+    new_h = (new_h // 32) * 32
+    resized = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
+    return resized, scale
+def resize_mask_to_original(mask: np.ndarray, original_size: Tuple[int, int]) -> np.ndarray:
+    return cv2.resize(mask.astype(np.uint8), (original_size[1], original_size[0]), interpolation=cv2.INTER_NEAREST)
+def prepare_display_image(image: np.ndarray, max_width: int = DISPLAY_MAX_WIDTH, max_height: int = DISPLAY_MAX_HEIGHT) -> np.ndarray:
+    h, w = image.shape[:2]
+    scale = 1.0
+    if w > max_width:
+        scale = max_width / w
+    if h * scale > max_height:
+        scale = max_height / h
+    if scale < 1.0:
+        new_w = int(w * scale)
+        new_h = int(h * scale)
+        return cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_LANCZOS4)
+    return image
+class OneFormerManager:
+    def __init__(self):
+        self.predictor = None
+        self.metadata = None
+        self.initialized = False
+        self.process_size = ONEFORMER_CONFIG["ADE20K"]["process_size"]
+        self.max_size = ONEFORMER_CONFIG["ADE20K"]["max_size"]
+    def initialize(self, backbone: str = "swin"):
+        if not ONEFORMER_AVAILABLE:
+            logger.error("OneFormer not available")
+            return False
+        try:
+            cfg = get_cfg()
+            add_deeplab_config(cfg)
+            add_common_config(cfg)
+            add_swin_config(cfg)
+            add_oneformer_config(cfg)
+            add_dinat_config(cfg)
+            config = ONEFORMER_CONFIG["ADE20K"]
+            cfg.merge_from_file(config["swin_cfg"])
+            cfg.MODEL.DEVICE = DEVICE
+            model_path = hf_hub_download(
+                repo_id=config["swin_model"],
+                filename=config["swin_file"]
+            )
+            cfg.MODEL.WEIGHTS = model_path
+            cfg.freeze()
+            self.predictor = OneFormerPredictor(cfg)
+            self.metadata = MetadataCatalog.get(
+                cfg.DATASETS.TEST_PANOPTIC[0] if len(cfg.DATASETS.TEST_PANOPTIC) else "__unused"
+            )
+            self.initialized = True
+            logger.info("OneFormer initialized successfully")
+            return True
+        except Exception as e:
+            logger.error(f"Failed to initialize OneFormer: {e}")
+            return False
+    def semantic_segmentation(self, image: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+        if not self.initialized:
+            raise RuntimeError("OneFormer not initialized")
+        original_size = (image.shape[0], image.shape[1])
+        image_processed, scale = resize_image_for_processing(image, self.process_size, self.max_size)
+        logger.info(f"Processing image at {image_processed.shape}, scale: {scale}")
+        predictions = self.predictor(image_processed, "semantic")
+        seg_mask_processed = predictions["sem_seg"].argmax(dim=0).cpu().numpy()
+        seg_mask_original = resize_mask_to_original(seg_mask_processed, original_size)
+        visualizer = Visualizer(
+            image[:, :, ::-1],
+            metadata=self.metadata,
+            instance_mode=ColorMode.IMAGE,
+            scale=1.0
+        )
+        vis_output = visualizer.draw_sem_seg(seg_mask_original, alpha=0.6)
+        vis_image = vis_output.get_image()[:, :, ::-1]
+        vis_image_display = prepare_display_image(vis_image)
+        return seg_mask_original, vis_image_display
+    def extract_floor_areas(self, segmentation: np.ndarray) -> np.ndarray:
+        floor_mask = np.zeros_like(segmentation, dtype=bool)
+        for class_ids in FLOOR_CLASSES.values():
+            for class_id in class_ids:
+                floor_mask |= (segmentation == class_id)
+        return floor_mask
+class ImprovedBlackspotDetector:
+    def __init__(self, model_path: str = None):
+        self.model_path = model_path
+        self.predictor = None
+        self.floor_classes = [3, 4, 13, 28, 78]
+    def download_model(self) -> str:
+        try:
+            model_path = hf_hub_download(
+                repo_id=BLACKSPOT_MODEL_REPO,
+                filename=BLACKSPOT_MODEL_FILE
+            )
+            logger.info(f"Downloaded blackspot model to: {model_path}")
+            return model_path
+        except Exception as e:
+            logger.warning(f"Could not download blackspot model from HF: {e}")
+            local_path = f"./output_floor_blackspot/{BLACKSPOT_MODEL_FILE}"
+            if os.path.exists(local_path):
+                logger.info(f"Using local blackspot model: {local_path}")
+                return local_path
+            return None
+    def initialize(self, threshold: float = 0.5) -> bool:
+        try:
+            if self.model_path is None:
+                self.model_path = self.download_model()
+            if self.model_path is None:
+                logger.error("No blackspot model available")
+                return False
+            cfg = get_cfg()
+            cfg.merge_from_file(
+                model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")
+            )
+            cfg.MODEL.ROI_HEADS.NUM_CLASSES = 2
+            cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = threshold
+            cfg.MODEL.WEIGHTS = self.model_path
+            cfg.MODEL.DEVICE = DEVICE
+            self.predictor = DefaultPredictor(cfg)
+            logger.info("MaskRCNN blackspot detector initialized")
+            return True
+        except Exception as e:
+            logger.error(f"Failed to initialize blackspot detector: {e}")
+            return False
+    def is_on_floor_surface(
+        self,
+        blackspot_mask: np.ndarray,
+        segmentation: np.ndarray,
+        floor_mask: np.ndarray,
+        overlap_threshold: float = 0.8
+    ) -> bool:
+        if np.sum(blackspot_mask) == 0:
+            return False
+        overlap = blackspot_mask & floor_mask
+        overlap_ratio = np.sum(overlap) / np.sum(blackspot_mask)
+        if overlap_ratio < overlap_threshold:
+            return False
+        blackspot_pixels = segmentation[blackspot_mask]
+        if len(blackspot_pixels) == 0:
+            return False
+        unique_classes, counts = np.unique(blackspot_pixels, return_counts=True)
+        floor_pixel_count = sum(
+            counts[unique_classes == cls] for cls in self.floor_classes if cls in unique_classes
+        )
+        floor_ratio = floor_pixel_count / len(blackspot_pixels)
+        return floor_ratio > 0.7
+    def filter_non_floor_blackspots(
+        self,
+        blackspot_masks: List[np.ndarray],
+        segmentation: np.ndarray,
+        floor_mask: np.ndarray
+    ) -> List[np.ndarray]:
+        filtered_masks = []
+        for mask in blackspot_masks:
+            if self.is_on_floor_surface(mask, segmentation, floor_mask):
+                filtered_masks.append(mask)
+            else:
+                logger.debug(f"Filtered out non-floor blackspot with area {np.sum(mask)}")
+        return filtered_masks
+    def detect_blackspots(
+        self,
+        image: np.ndarray,
+        segmentation: np.ndarray,
+        floor_prior: Optional[np.ndarray] = None
+    ) -> Dict:
+        if self.predictor is None:
+            raise RuntimeError("Blackspot detector not initialized")
+        original_h, original_w = image.shape[:2]
+        if floor_prior is not None and floor_prior.shape != (original_h, original_w):
+            floor_prior = cv2.resize(
+                floor_prior.astype(np.uint8),
+                (original_w, original_h),
+                interpolation=cv2.INTER_NEAREST
+            ).astype(bool)
+        if segmentation.shape != (original_h, original_w):
+            segmentation = cv2.resize(
+                segmentation.astype(np.uint8),
+                (original_w, original_h),
+                interpolation=cv2.INTER_NEAREST
+            )
+        try:
+            outputs = self.predictor(image)
+            instances = outputs["instances"].to("cpu")
+        except Exception as e:
+            logger.error(f"Error in MaskRCNN prediction: {e}")
+            return self._empty_results(image)
+        if len(instances) == 0:
+            return self._empty_results(image)
+        pred_classes = instances.pred_classes.numpy()
+        pred_masks = instances.pred_masks.numpy()
+        scores = instances.scores.numpy()
+        blackspot_indices = pred_classes == 1
+        blackspot_masks = pred_masks[blackspot_indices] if np.any(blackspot_indices) else []
+        blackspot_scores = scores[blackspot_indices] if np.any(blackspot_indices) else []
+        if floor_prior is not None:
+            floor_mask = floor_prior
+        else:
+            floor_mask = np.zeros(segmentation.shape, dtype=bool)
+            for cls in self.floor_classes:
+                floor_mask |= (segmentation == cls)
+        filtered_blackspot_masks = self.filter_non_floor_blackspots(
+            blackspot_masks, segmentation, floor_mask
+        )
+        combined_blackspot = np.zeros(image.shape[:2], dtype=bool)
+        for mask in filtered_blackspot_masks:
+            combined_blackspot |= mask
+        visualization = self.create_visualization(image, floor_mask, combined_blackspot)
+        visualization_display = prepare_display_image(visualization)
+        floor_area = int(np.sum(floor_mask))
+        blackspot_area = int(np.sum(combined_blackspot))
+        coverage_percentage = (blackspot_area / floor_area * 100) if floor_area > 0 else 0
+        return {
+            'visualization': visualization_display,
+            'floor_mask': floor_mask,
+            'blackspot_mask': combined_blackspot,
+            'floor_area': floor_area,
+            'blackspot_area': blackspot_area,
+            'coverage_percentage': coverage_percentage,
+            'num_detections': len(filtered_blackspot_masks),
+            'avg_confidence': float(np.mean(blackspot_scores)) if len(blackspot_scores) > 0 else 0.0
+        }
+    def create_visualization(
+        self,
+        image: np.ndarray,
+        floor_mask: np.ndarray,
+        blackspot_mask: np.ndarray
+    ) -> np.ndarray:
+        vis = image.copy()
+        floor_overlay = vis.copy()
+        floor_overlay[floor_mask] = [0, 255, 0]
+        vis = cv2.addWeighted(vis, 0.7, floor_overlay, 0.3, 0)
+        vis[blackspot_mask] = [255, 0, 0]
+        blackspot_contours, _ = cv2.findContours(
+            blackspot_mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
+        )
+        cv2.drawContours(vis, blackspot_contours, -1, (255, 255, 0), 4)
+        return vis
+    def _empty_results(self, image: np.ndarray) -> Dict:
+        empty_mask = np.zeros(image.shape[:2], dtype=bool)
+        visualization_display = prepare_display_image(image)
+        return {
+            'visualization': visualization_display,
+            'floor_mask': empty_mask,
+            'blackspot_mask': empty_mask,
+            'floor_area': 0,
+            'blackspot_area': 0,
+            'coverage_percentage': 0,
+            'num_detections': 0,
+            'avg_confidence': 0.0
+        }
+class NeuroNestApp:
+    def __init__(self):
+        self.oneformer = OneFormerManager()
+        self.blackspot_detector = None
+        self.contrast_analyzer = UniversalContrastAnalyzer(wcag_threshold=4.5)
+        self.initialized = False
+    def initialize(self):
+        logger.info("Initializing NeuroNest application...")
+        oneformer_success = self.oneformer.initialize()
+        blackspot_success = False
+        try:
+            self.blackspot_detector = ImprovedBlackspotDetector()
+            blackspot_success = self.blackspot_detector.initialize()
+        except Exception as e:
+            logger.warning(f"Could not initialize blackspot detector: {e}")
+        self.initialized = oneformer_success
+        return oneformer_success, blackspot_success
+    def analyze_image(
+        self,
+        image_path: str,
+        blackspot_threshold: float = 0.5,
+        contrast_threshold: float = 4.5,
+        enable_blackspot: bool = True,
+        enable_contrast: bool = True
+    ) -> Dict:
+        if not self.initialized:
+            return {"error": "Application not properly initialized"}
+        try:
+            image = cv2.imread(image_path, cv2.IMREAD_COLOR)
+            if image is None:
+                return {"error": "Could not load image"}
+            image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+            logger.info(f"Loaded image with shape: {image_rgb.shape}")
+            results = {
+                'original_image': image_rgb,
+                'segmentation': None,
+                'blackspot': None,
+                'contrast': None,
+                'statistics': {}
+            }
+            logger.info("Running semantic segmentation...")
+            seg_mask, seg_visualization = self.oneformer.semantic_segmentation(image_rgb)
+            results['segmentation'] = {
+                'visualization': seg_visualization,
+                'mask': seg_mask
+            }
+            floor_prior = self.oneformer.extract_floor_areas(seg_mask)
+            if enable_blackspot and self.blackspot_detector is not None:
+                logger.info("Running blackspot detection...")
+                try:
+                    blackspot_results = self.blackspot_detector.detect_blackspots(
+                        image_rgb, seg_mask, floor_prior
+                    )
+                    results['blackspot'] = blackspot_results
+                    logger.info("Blackspot detection completed")
+                except Exception as e:
+                    logger.error(f"Error in blackspot detection: {e}")
+                    results['blackspot'] = None
+            if enable_contrast:
+                logger.info("Running universal contrast analysis...")
+                try:
+                    contrast_results = self.contrast_analyzer.analyze_contrast(
+                        image_rgb, seg_mask
+                    )
+                    contrast_viz_display = prepare_display_image(contrast_results['visualization'])
+                    contrast_results['visualization'] = contrast_viz_display
+                    results['contrast'] = contrast_results
+                    logger.info("Contrast analysis completed")
+                except Exception as e:
+                    logger.error(f"Error in contrast analysis: {e}")
+                    results['contrast'] = None
+            stats = self._generate_statistics(results)
+            results['statistics'] = stats
+            logger.info("Image analysis completed successfully")
+            return results
+        except Exception as e:
+            logger.error(f"Error in image analysis: {e}")
+            import traceback
+            traceback.print_exc()
+            return {"error": f"Analysis failed: {str(e)}"}
+    def _generate_statistics(self, results: Dict) -> Dict:
+        stats = {}
+        if results['segmentation']:
+            unique_classes = np.unique(results['segmentation']['mask'])
+            stats['segmentation'] = {
+                'num_classes': len(unique_classes),
+                'image_size': results['segmentation']['mask'].shape
+            }
+        if results['blackspot']:
+            bs = results['blackspot']
+            stats['blackspot'] = {
+                'floor_area_pixels': bs['floor_area'],
+                'blackspot_area_pixels': bs['blackspot_area'],
+                'coverage_percentage': bs['coverage_percentage'],
+                'num_detections': bs['num_detections'],
+                'avg_confidence': bs['avg_confidence']
+            }
+        if results['contrast']:
+            cs = results['contrast']['statistics']
+            stats['contrast'] = {
+                'total_segments': cs.get('total_segments', 0),
+                'analyzed_pairs': cs.get('analyzed_pairs', 0),
+                'low_contrast_pairs': cs.get('low_contrast_pairs', 0),
+                'critical_issues': cs.get('critical_issues', 0),
+                'high_priority_issues': cs.get('high_priority_issues', 0),
+                'medium_priority_issues': cs.get('medium_priority_issues', 0),
+                'floor_object_issues': cs.get('floor_object_issues', 0)
+            }
+        return stats
+def create_gradio_interface():
+    app = NeuroNestApp()
+    oneformer_ok, blackspot_ok = app.initialize()
+    if not oneformer_ok:
+        raise RuntimeError("Failed to initialize OneFormer")
+    # Define sample images
+    SAMPLE_IMAGES = [
+        "samples/example1.png",
+        "samples/example2.png",
+        "samples/example3.png"
+    ]
+    # Check if sample images exist
+    sample_images_available = all(os.path.exists(img) for img in SAMPLE_IMAGES)
+    def analyze_wrapper(
+        image_path,
+        blackspot_threshold,
+        contrast_threshold,
+        enable_blackspot,
+        enable_contrast
+    ):
+        if image_path is None:
+            return None, None, None, "Please upload an image"
+        results = app.analyze_image(
+            image_path=image_path,
+            blackspot_threshold=blackspot_threshold,
+            contrast_threshold=contrast_threshold,
+            enable_blackspot=enable_blackspot,
+            enable_contrast=enable_contrast
+        )
+        if "error" in results:
+            return None, None, None, f"Error: {results['error']}"
+        seg_output = results['segmentation']['visualization'] if results['segmentation'] else None
+        blackspot_output = results['blackspot']['visualization'] if results['blackspot'] else None
+        contrast_output = results['contrast']['visualization'] if results['contrast'] else None
+        if results['contrast']:
+            contrast_report = app.contrast_analyzer.generate_report(results['contrast'])
+        else:
+            contrast_report = "Contrast analysis not performed."
+        if results['blackspot']:
+            bs = results['blackspot']
+            blackspot_report = (
+                f"**Floor Area:** {bs['floor_area']:,} pixels  \n"
+                f"**Blackspot Area:** {bs['blackspot_area']:,} pixels  \n"
+                f"**Coverage:** {bs['coverage_percentage']:.2f}%  \n"
+                f"**Detections:** {bs['num_detections']}  \n"
+                f"**Average Confidence:** {bs['avg_confidence']:.2f}"
+            )
+        else:
+            blackspot_report = "Blackspot analysis not performed."
+        report = generate_comprehensive_report(results, contrast_report, blackspot_report)
+        return seg_output, blackspot_output, contrast_output, report
+    def generate_comprehensive_report(results: Dict, contrast_report: str, blackspot_report: str) -> str:
+        report = ["# 🧠 NeuroNest Analysis Report\n"]
+        report.append(f"*Generated: {time.strftime('%Y-%m-%d %H:%M:%S')}*\n")
+        if results['segmentation']:
+            stats = results['statistics'].get('segmentation', {})
+            report.append("## 🎯 Object Segmentation")
+            report.append(f"- **Classes detected:** {stats.get('num_classes', 'N/A')}")
+            report.append(f"- **Resolution:** {stats.get('image_size', 'N/A')}")
+            report.append("")
+        report.append("## ⚫ Blackspot Analysis")
+        report.append(blackspot_report)
+        report.append("")
+        report.append("## 🎨 Universal Contrast Analysis")
+        report.append(contrast_report)
+        report.append("")
+        report.append("## 📋 Recommendations for Alzheimer's Care")
+        has_issues = False
+        if results['blackspot'] and results['statistics']['blackspot']['coverage_percentage'] > 0:
+            has_issues = True
+            report.append("\n### Blackspot Mitigation:")
+            report.append("- Replace dark flooring materials with lighter alternatives")
+            report.append("- Install additional lighting in affected areas")
+            report.append("- Use light-colored rugs or runners to cover dark spots")
+            report.append("- Add contrasting tape or markers around blackspot perimeters")
+        if results['contrast'] and results['statistics']['contrast']['low_contrast_pairs'] > 0:
+            has_issues = True
+            report.append("\n### Contrast Improvements:")
+            contrast_issues = results['contrast']['issues']
+            critical_issues = [i for i in contrast_issues if i['severity'] == 'critical']
+            high_issues = [i for i in contrast_issues if i['severity'] == 'high']
+            if critical_issues:
+                report.append("\n**CRITICAL - Immediate attention required:**")
+                for issue in critical_issues[:3]:
+                    cat1, cat2 = issue['categories']
+                    report.append(f"- {cat1.title()} ↔ {cat2.title()}: Increase contrast to 7:1 minimum")
+            if high_issues:
+                report.append("\n**HIGH PRIORITY:**")
+                for issue in high_issues[:3]:
+                    cat1, cat2 = issue['categories']
+                    report.append(f"- {cat1.title()} ↔ {cat2.title()}: Increase contrast to 4.5:1 minimum")
+            report.append("\n**General recommendations:**")
+            report.append("- Paint furniture in colors that contrast with floors/walls")
+            report.append("- Add colored tape or markers to furniture edges")
+            report.append("- Install LED strip lighting under furniture edges")
+            report.append("- Use contrasting placemats, cushions, or covers")
+        if not has_issues:
+            report.append("\n✅ **Excellent!** This environment appears well-optimized for individuals with Alzheimer's.")
+            report.append("No significant visual hazards detected.")
+        return "\n".join(report)
+    title = "🧠 NeuroNest: AI-Powered Environment Safety Analysis"
+    description = """
+    **This is the backend of NeuroNest - an object detection and visual analysis application intended to improve the lives of those affected by Alzheimers.**
+    **Texas State CS && Interior Design Dept. - Abheek Pradhan, Dr. Nadim Adi, Dr. Greg Lakomski**
+    This system provides:
+    - **Object Segmentation**: Identifies all room elements (floors, walls, furniture)
+    - **Floor-Only Blackspot Detection**: Locates dangerous dark areas on walking surfaces
+    - **Universal Contrast Analysis**: Evaluates visibility between ALL adjacent objects
+    *Following WCAG 2.1 guidelines for visual accessibility  | Upload a Picture. Click 'Analyze Environment'.Then scroll down.*
+    """
+    with gr.Blocks(css="""
+        .container { max-width: 100%; margin: auto; padding: 20px; }
+        .image-output { margin: 20px 0; }
+        .image-output img {
+            width: 100%;
+            height: auto;
+            max-width: 1920px;
+            margin: 0 auto;
+            display: block;
+            border: 1px solid #ddd;
+            border-radius: 8px;
+        }
+        .controls-row { margin-bottom: 30px; background: #f5f5f5; padding: 20px; border-radius: 8px; }
+        .main-button { height: 80px !important; font-size: 1.3em !important; font-weight: bold !important; }
+        .report-box { max-width: 1200px; margin: 30px auto; padding: 30px; background: #f9f9f9; border-radius: 8px; }
+        h2 { margin-top: 40px; margin-bottom: 20px; color: #333; }
+        .sample-section {
+            margin-bottom: 30px;
+            padding: 20px;
+            background: #fafafa;
+            border-radius: 12px;
+            border: 1px solid #e0e0e0;
+        }
+        .examples-holder .examples-table {
+            display: flex !important;
+            justify-content: center !important;
+            gap: 20px !important;
+            margin-top: 15px !important;
+        }
+        .examples-holder img {
+            border-radius: 8px;
+            cursor: pointer;
+            transition: transform 0.2s, box-shadow 0.2s;
+            border: 2px solid transparent;
+        }
+        .examples-holder img:hover {
+            transform: scale(1.05);
+            box-shadow: 0 4px 12px rgba(0,0,0,0.15);
+            border: 2px solid #4A90E2;
+        }
+    """, theme=gr.themes.Base()) as interface:
+        with gr.Column(elem_classes="container"):
+            gr.Markdown(f"# {title}")
+            gr.Markdown(description)
+            if not blackspot_ok:
+                gr.Markdown("""
+                ⚠️ **Note:** Blackspot detection model not available.
+                To enable blackspot detection, upload the model to HuggingFace or ensure it's in the local directory.
+                """)
+            # First create a hidden image input that will be used by Examples
+            with gr.Row(visible=False):
+                image_input = gr.Image(
+                    label="📸 Upload Room Image",
+                    type="filepath",
+                    height=500
+                )
+            # Sample images section at the top with the actual clickable examples
+            with gr.Column(elem_classes="sample-section"):
+                gr.Markdown("### 🖼️ Try Sample Images")
+                gr.Markdown("*Click any image below to load it for analysis or upload your own. || Then scroll down and click analyze environment*")
+                if sample_images_available:
+                    gr.Examples(
+                        examples=SAMPLE_IMAGES,
+                        inputs=image_input,
+                        label="",
+                        examples_per_page=3
+                    )
+                else:
+                    gr.Markdown("*Sample images not found in samples/ directory*")
+            with gr.Row(elem_classes="controls-row"):
+                with gr.Column(scale=1):
+                    enable_blackspot = gr.Checkbox(
+                        value=blackspot_ok,
+                        label="Enable Floor Blackspot Detection",
+                        interactive=blackspot_ok
+                    )
+                    blackspot_threshold = gr.Slider(
+                        minimum=0.1,
+                        maximum=0.9,
+                        value=0.5,
+                        step=0.05,
+                        label="Blackspot Sensitivity",
+                        visible=blackspot_ok
+                    )
+                with gr.Column(scale=1):
+                    enable_contrast = gr.Checkbox(
+                        value=True,
+                        label="Enable Universal Contrast Analysis"
+                    )
+                    contrast_threshold = gr.Slider(
+                        minimum=3.0,
+                        maximum=7.0,
+                        value=4.5,
+                        step=0.1,
+                        label="WCAG Contrast Threshold"
+                    )
+            with gr.Row():
+                with gr.Column(scale=2):
+                    # Now show the actual visible image input
+                    image_input_display = gr.Image(
+                        label="📸 Upload Room Image",
+                        type="filepath",
+                        height=500
+                    )
+                    # Connect the hidden input to the visible one
+                    image_input.change(
+                        fn=lambda x: x,
+                        inputs=image_input,
+                        outputs=image_input_display
+                    )
+                with gr.Column(scale=1):
+                    analyze_button = gr.Button(
+                        "🔍 Analyze Environment",
+                        variant="primary",
+                        elem_classes="main-button"
+                    )
+            gr.Markdown("---")
+            gr.Markdown("## 🎯 Segmented Objects")
+            seg_display = gr.Image(
+                label=None,
+                interactive=False,
+                show_label=False,
+                elem_classes="image-output"
+            )
+            if blackspot_ok:
+                gr.Markdown("## ⚫ Blackspot Detection")
+                blackspot_display = gr.Image(
+                    label=None,
+                    interactive=False,
+                    show_label=False,
+                    elem_classes="image-output"
+                )
+            else:
+                blackspot_display = gr.Image(visible=False)
+            gr.Markdown("## 🎨 Contrast Analysis")
+            contrast_display = gr.Image(
+                label=None,
+                interactive=False,
+                show_label=False,
+                elem_classes="image-output"
+            )
+            gr.Markdown("---")
+            analysis_report = gr.Markdown(
+                value="Upload an image and click 'Analyze Environment' to begin.",
+                elem_classes="report-box"
+            )
+            # Use image_input_display for the analysis
+            analyze_button.click(
+                fn=analyze_wrapper,
+                inputs=[
+                    image_input_display,
+                    blackspot_threshold,
+                    contrast_threshold,
+                    enable_blackspot,
+                    enable_contrast
+                ],
+                outputs=[
+                    seg_display,
+                    blackspot_display,
+                    contrast_display,
+                    analysis_report
+                ]
+            )
+            gr.Markdown("""
+                ---
+                **NeuroNest** v2.0 - Enhanced with floor-only blackspot detection and universal contrast analysis
+                *Creating safer environments for cognitive health through AI*
+                """)
+    return interface
+if __name__ == "__main__":
+    print(f"🚀 Starting NeuroNest on {DEVICE}")
+    print(f"OneFormer available: {ONEFORMER_AVAILABLE}")
+    try:
+        interface = create_gradio_interface()
+        interface.queue(max_size=10).launch(
+            server_name="0.0.0.0",
+            server_port=7860,
+            share=True
+        )
+    except Exception as e:
+        logger.error(f"Failed to launch application: {e}")
+        raise

install.py ADDED Viewed

	@@ -0,0 +1,19 @@

+import subprocess
+import sys
+import os
+# Install torch first
+subprocess.check_call([sys.executable, "-m", "pip", "install", "torch>=2.0.0", "torchvision>=0.15.0"])
+# Clone and install detectron2
+if not os.path.exists('detectron2'):
+    subprocess.check_call(["git", "clone", "https://github.com/facebookresearch/detectron2"])
+# Install detectron2 dependencies
+import distutils.core
+dist = distutils.core.run_setup("./detectron2/setup.py")
+deps = ' '.join([f"'{x}'" for x in dist.install_requires if 'torch' not in x])
+subprocess.check_call(f"{sys.executable} -m pip install {deps}", shell=True)
+# Add detectron2 to path
+sys.path.insert(0, os.path.abspath('./detectron2'))

oneformer/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

oneformer/__init__.py ADDED Viewed

	@@ -0,0 +1,9 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+from . import data  # register all new datasets
+from . import modeling
+# config
+from .config import *
+# models
+from .oneformer_model import OneFormer

oneformer/config.py ADDED Viewed

	@@ -0,0 +1,239 @@

+# -*- coding: utf-8 -*-
+# Copyright (c) Facebook, Inc. and its affiliates.
+from detectron2.config import CfgNode as CN
+__all__ = ["add_common_config", "add_oneformer_config", "add_swin_config",
+            "add_dinat_config", "add_beit_adapter_config", "add_convnext_config"]
+def add_common_config(cfg):
+    """
+    Add config for common configuration
+    """
+    # data config
+    # select the dataset mapper
+    cfg.INPUT.DATASET_MAPPER_NAME = "oneformer_unified"
+    # Color augmentation
+    cfg.INPUT.COLOR_AUG_SSD = False
+    # We retry random cropping until no single category in semantic segmentation GT occupies more
+    # than `SINGLE_CATEGORY_MAX_AREA` part of the crop.
+    cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA = 1.0
+    # Pad image and segmentation GT in dataset mapper.
+    cfg.INPUT.SIZE_DIVISIBILITY = -1
+    cfg.INPUT.TASK_SEQ_LEN = 77
+    cfg.INPUT.MAX_SEQ_LEN = 77
+    cfg.INPUT.TASK_PROB = CN()
+    cfg.INPUT.TASK_PROB.SEMANTIC = 0.33
+    cfg.INPUT.TASK_PROB.INSTANCE = 0.66
+    # test dataset
+    cfg.DATASETS.TEST_PANOPTIC = ("",)
+    cfg.DATASETS.TEST_INSTANCE = ("",)
+    cfg.DATASETS.TEST_SEMANTIC = ("",)
+    # solver config
+    # weight decay on embedding
+    cfg.SOLVER.WEIGHT_DECAY_EMBED = 0.0
+    # optimizer
+    cfg.SOLVER.OPTIMIZER = "ADAMW"
+    cfg.SOLVER.BACKBONE_MULTIPLIER = 0.1
+    # wandb
+    cfg.WANDB = CN()
+    cfg.WANDB.PROJECT = "unified_dense_recognition"
+    cfg.WANDB.NAME = None
+    cfg.MODEL.IS_TRAIN = False
+    cfg.MODEL.IS_DEMO = True
+    # text encoder config
+    cfg.MODEL.TEXT_ENCODER = CN()
+    cfg.MODEL.TEXT_ENCODER.WIDTH = 256
+    cfg.MODEL.TEXT_ENCODER.CONTEXT_LENGTH = 77
+    cfg.MODEL.TEXT_ENCODER.NUM_LAYERS = 12
+    cfg.MODEL.TEXT_ENCODER.VOCAB_SIZE = 49408
+    cfg.MODEL.TEXT_ENCODER.PROJ_NUM_LAYERS = 2
+    cfg.MODEL.TEXT_ENCODER.N_CTX = 16
+    # mask_former inference config
+    cfg.MODEL.TEST = CN()
+    cfg.MODEL.TEST.SEMANTIC_ON = True
+    cfg.MODEL.TEST.INSTANCE_ON = False
+    cfg.MODEL.TEST.PANOPTIC_ON = False
+    cfg.MODEL.TEST.DETECTION_ON = False
+    cfg.MODEL.TEST.OBJECT_MASK_THRESHOLD = 0.0
+    cfg.MODEL.TEST.OVERLAP_THRESHOLD = 0.0
+    cfg.MODEL.TEST.SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE = False
+    cfg.MODEL.TEST.TASK = "panoptic"
+    # TEST AUG Slide
+    cfg.TEST.AUG.IS_SLIDE = False
+    cfg.TEST.AUG.CROP_SIZE = (640, 640)
+    cfg.TEST.AUG.STRIDE = (426, 426)
+    cfg.TEST.AUG.SCALE = (2048, 640)
+    cfg.TEST.AUG.SETR_MULTI_SCALE = True
+    cfg.TEST.AUG.KEEP_RATIO = True
+    cfg.TEST.AUG.SIZE_DIVISOR = 32
+    # pixel decoder config
+    cfg.MODEL.SEM_SEG_HEAD.MASK_DIM = 256
+    # adding transformer in pixel decoder
+    cfg.MODEL.SEM_SEG_HEAD.TRANSFORMER_ENC_LAYERS = 0
+    # pixel decoder
+    cfg.MODEL.SEM_SEG_HEAD.PIXEL_DECODER_NAME = "BasePixelDecoder"
+    cfg.MODEL.SEM_SEG_HEAD.SEM_EMBED_DIM = 256
+    cfg.MODEL.SEM_SEG_HEAD.INST_EMBED_DIM = 256
+    # LSJ aug
+    cfg.INPUT.IMAGE_SIZE = 1024
+    cfg.INPUT.MIN_SCALE = 0.1
+    cfg.INPUT.MAX_SCALE = 2.0
+    # MSDeformAttn encoder configs
+    cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES = ["res3", "res4", "res5"]
+    cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_POINTS = 4
+    cfg.MODEL.SEM_SEG_HEAD.DEFORMABLE_TRANSFORMER_ENCODER_N_HEADS = 8
+def add_oneformer_config(cfg):
+    """
+    Add config for ONE_FORMER.
+    """
+    # mask_former model config
+    cfg.MODEL.ONE_FORMER = CN()
+    # loss
+    cfg.MODEL.ONE_FORMER.DEEP_SUPERVISION = True
+    cfg.MODEL.ONE_FORMER.NO_OBJECT_WEIGHT = 0.1
+    cfg.MODEL.ONE_FORMER.CLASS_WEIGHT = 1.0
+    cfg.MODEL.ONE_FORMER.DICE_WEIGHT = 1.0
+    cfg.MODEL.ONE_FORMER.MASK_WEIGHT = 20.0
+    cfg.MODEL.ONE_FORMER.CONTRASTIVE_WEIGHT = 0.5
+    cfg.MODEL.ONE_FORMER.CONTRASTIVE_TEMPERATURE = 0.07
+    # transformer config
+    cfg.MODEL.ONE_FORMER.NHEADS = 8
+    cfg.MODEL.ONE_FORMER.DROPOUT = 0.1
+    cfg.MODEL.ONE_FORMER.DIM_FEEDFORWARD = 2048
+    cfg.MODEL.ONE_FORMER.ENC_LAYERS = 0
+    cfg.MODEL.ONE_FORMER.CLASS_DEC_LAYERS = 2
+    cfg.MODEL.ONE_FORMER.DEC_LAYERS = 6
+    cfg.MODEL.ONE_FORMER.PRE_NORM = False
+    cfg.MODEL.ONE_FORMER.HIDDEN_DIM = 256
+    cfg.MODEL.ONE_FORMER.NUM_OBJECT_QUERIES = 120
+    cfg.MODEL.ONE_FORMER.NUM_OBJECT_CTX = 16
+    cfg.MODEL.ONE_FORMER.USE_TASK_NORM = True
+    cfg.MODEL.ONE_FORMER.TRANSFORMER_IN_FEATURE = "res5"
+    cfg.MODEL.ONE_FORMER.ENFORCE_INPUT_PROJ = False
+    # Sometimes `backbone.size_divisibility` is set to 0 for some backbone (e.g. ResNet)
+    # you can use this config to override
+    cfg.MODEL.ONE_FORMER.SIZE_DIVISIBILITY = 32
+    # transformer module
+    cfg.MODEL.ONE_FORMER.TRANSFORMER_DECODER_NAME = "ContrastiveMultiScaleMaskedTransformerDecoder"
+    # point loss configs
+    # Number of points sampled during training for a mask point head.
+    cfg.MODEL.ONE_FORMER.TRAIN_NUM_POINTS = 112 * 112
+    # Oversampling parameter for PointRend point sampling during training. Parameter `k` in the
+    # original paper.
+    cfg.MODEL.ONE_FORMER.OVERSAMPLE_RATIO = 3.0
+    # Importance sampling parameter for PointRend point sampling during training. Parametr `beta` in
+    # the original paper.
+    cfg.MODEL.ONE_FORMER.IMPORTANCE_SAMPLE_RATIO = 0.75
+def add_swin_config(cfg):
+    """
+    Add config forSWIN Backbone.
+    """
+    # swin transformer backbone
+    cfg.MODEL.SWIN = CN()
+    cfg.MODEL.SWIN.PRETRAIN_IMG_SIZE = 224
+    cfg.MODEL.SWIN.PATCH_SIZE = 4
+    cfg.MODEL.SWIN.EMBED_DIM = 96
+    cfg.MODEL.SWIN.DEPTHS = [2, 2, 6, 2]
+    cfg.MODEL.SWIN.NUM_HEADS = [3, 6, 12, 24]
+    cfg.MODEL.SWIN.WINDOW_SIZE = 7
+    cfg.MODEL.SWIN.MLP_RATIO = 4.0
+    cfg.MODEL.SWIN.QKV_BIAS = True
+    cfg.MODEL.SWIN.QK_SCALE = None
+    cfg.MODEL.SWIN.DROP_RATE = 0.0
+    cfg.MODEL.SWIN.ATTN_DROP_RATE = 0.0
+    cfg.MODEL.SWIN.DROP_PATH_RATE = 0.3
+    cfg.MODEL.SWIN.APE = False
+    cfg.MODEL.SWIN.PATCH_NORM = True
+    cfg.MODEL.SWIN.OUT_FEATURES = ["res2", "res3", "res4", "res5"]
+    cfg.MODEL.SWIN.USE_CHECKPOINT = False
+    ## Semask additions
+    cfg.MODEL.SWIN.SEM_WINDOW_SIZE = 7
+    cfg.MODEL.SWIN.NUM_SEM_BLOCKS = 1
+def add_dinat_config(cfg):
+    """
+    Add config for NAT Backbone.
+    """
+    # DINAT transformer backbone
+    cfg.MODEL.DiNAT = CN()
+    cfg.MODEL.DiNAT.DEPTHS = [3, 4, 18, 5]
+    cfg.MODEL.DiNAT.OUT_FEATURES = ["res2", "res3", "res4", "res5"]
+    cfg.MODEL.DiNAT.EMBED_DIM = 64
+    cfg.MODEL.DiNAT.MLP_RATIO = 3.0
+    cfg.MODEL.DiNAT.NUM_HEADS = [2, 4, 8, 16]
+    cfg.MODEL.DiNAT.DROP_PATH_RATE = 0.2
+    cfg.MODEL.DiNAT.KERNEL_SIZE = 7
+    cfg.MODEL.DiNAT.DILATIONS = [[1, 16, 1], [1, 4, 1, 8], [1, 2, 1, 3, 1, 4], [1, 2, 1, 2, 1]]
+    cfg.MODEL.DiNAT.OUT_INDICES = (0, 1, 2, 3)
+    cfg.MODEL.DiNAT.QKV_BIAS = True
+    cfg.MODEL.DiNAT.QK_SCALE = None
+    cfg.MODEL.DiNAT.DROP_RATE = 0
+    cfg.MODEL.DiNAT.ATTN_DROP_RATE = 0.
+    cfg.MODEL.DiNAT.IN_PATCH_SIZE = 4
+def add_convnext_config(cfg):
+    """
+    Add config for ConvNeXt Backbone.
+    """
+    # swin transformer backbone
+    cfg.MODEL.CONVNEXT = CN()
+    cfg.MODEL.CONVNEXT.IN_CHANNELS = 3
+    cfg.MODEL.CONVNEXT.DEPTHS = [3, 3, 27, 3]
+    cfg.MODEL.CONVNEXT.DIMS = [192, 384, 768, 1536]
+    cfg.MODEL.CONVNEXT.DROP_PATH_RATE = 0.4
+    cfg.MODEL.CONVNEXT.LSIT = 1.0
+    cfg.MODEL.CONVNEXT.OUT_INDICES = [0, 1, 2, 3]
+    cfg.MODEL.CONVNEXT.OUT_FEATURES = ["res2", "res3", "res4", "res5"]
+def add_beit_adapter_config(cfg):
+    """
+    Add config for BEiT Adapter Backbone.
+    """
+    # beit adapter backbone
+    cfg.MODEL.BEiTAdapter = CN()
+    cfg.MODEL.BEiTAdapter.IMG_SIZE = 640
+    cfg.MODEL.BEiTAdapter.PATCH_SIZE = 16
+    cfg.MODEL.BEiTAdapter.EMBED_DIM = 1024
+    cfg.MODEL.BEiTAdapter.DEPTH = 24
+    cfg.MODEL.BEiTAdapter.NUM_HEADS = 16
+    cfg.MODEL.BEiTAdapter.MLP_RATIO = 4
+    cfg.MODEL.BEiTAdapter.QKV_BIAS = True
+    cfg.MODEL.BEiTAdapter.USE_ABS_POS_EMB = False
+    cfg.MODEL.BEiTAdapter.USE_REL_POS_BIAS = True
+    cfg.MODEL.BEiTAdapter.INIT_VALUES = 1e-6
+    cfg.MODEL.BEiTAdapter.DROP_PATH_RATE = 0.3
+    cfg.MODEL.BEiTAdapter.CONV_INPLANE = 64
+    cfg.MODEL.BEiTAdapter.N_POINTS = 4
+    cfg.MODEL.BEiTAdapter.DEFORM_NUM_HEADS = 16
+    cfg.MODEL.BEiTAdapter.CFFN_RATIO = 0.25
+    cfg.MODEL.BEiTAdapter.DEFORM_RATIO = 0.5
+    cfg.MODEL.BEiTAdapter.WITH_CP = True
+    cfg.MODEL.BEiTAdapter.INTERACTION_INDEXES=[[0, 5], [6, 11], [12, 17], [18, 23]]
+    cfg.MODEL.BEiTAdapter.OUT_FEATURES = ["res2", "res3", "res4", "res5"]

oneformer/data/__init__.py ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ # Copyright (c) Facebook, Inc. and its affiliates.
2	+ from . import datasets

oneformer/data/bpe_simple_vocab_16e6.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

oneformer/data/build.py ADDED Viewed

	@@ -0,0 +1,117 @@

+# Copyright (c) Facebook, Inc. and its affiliates.
+from typing import Any, Callable, Dict, List, Optional, Union
+import torch.utils.data as torchdata
+from detectron2.config import configurable
+from detectron2.data.common import DatasetFromList, MapDataset
+from detectron2.data.dataset_mapper import DatasetMapper
+from detectron2.data.samplers import (
+    InferenceSampler,
+)
+from detectron2.data.build import (
+    get_detection_dataset_dicts,
+    trivial_batch_collator
+)
+"""
+This file contains the default logic to build a dataloader for training or testing.
+"""
+__all__ = [
+    "build_detection_test_loader",
+]
+def _test_loader_from_config(cfg, dataset_name, mapper=None):
+    """
+    Uses the given `dataset_name` argument (instead of the names in cfg), because the
+    standard practice is to evaluate each test set individually (not combining them).
+    """
+    if isinstance(dataset_name, str):
+        dataset_name = [dataset_name]
+    dataset = get_detection_dataset_dicts(
+        dataset_name,
+        filter_empty=False,
+        proposal_files=[
+            cfg.DATASETS.PROPOSAL_FILES_TEST[list(cfg.DATASETS.TEST).index(x)] for x in dataset_name
+        ]
+        if cfg.MODEL.LOAD_PROPOSALS
+        else None,
+    )
+    if mapper is None:
+        mapper = DatasetMapper(cfg, False)
+    return {
+        "dataset": dataset,
+        "mapper": mapper,
+        "num_workers": cfg.DATALOADER.NUM_WORKERS,
+        "sampler": InferenceSampler(len(dataset))
+        if not isinstance(dataset, torchdata.IterableDataset)
+        else None,
+    }
+@configurable(from_config=_test_loader_from_config)
+def build_detection_test_loader(
+    dataset: Union[List[Any], torchdata.Dataset],
+    *,
+    mapper: Callable[[Dict[str, Any]], Any],
+    sampler: Optional[torchdata.Sampler] = None,
+    batch_size: int = 1,
+    num_workers: int = 0,
+    collate_fn: Optional[Callable[[List[Any]], Any]] = None,
+) -> torchdata.DataLoader:
+    """
+    Similar to `build_detection_train_loader`, with default batch size = 1,
+    and sampler = :class:`InferenceSampler`. This sampler coordinates all workers
+    to produce the exact set of all samples.
+    Args:
+        dataset: a list of dataset dicts,
+            or a pytorch dataset (either map-style or iterable). They can be obtained
+            by using :func:`DatasetCatalog.get` or :func:`get_detection_dataset_dicts`.
+        mapper: a callable which takes a sample (dict) from dataset
+           and returns the format to be consumed by the model.
+           When using cfg, the default choice is ``DatasetMapper(cfg, is_train=False)``.
+        sampler: a sampler that produces
+            indices to be applied on ``dataset``. Default to :class:`InferenceSampler`,
+            which splits the dataset across all workers. Sampler must be None
+            if `dataset` is iterable.
+        batch_size: the batch size of the data loader to be created.
+            Default to 1 image per worker since this is the standard when reporting
+            inference time in papers.
+        num_workers: number of parallel data loading workers
+        collate_fn: same as the argument of `torch.utils.data.DataLoader`.
+            Defaults to do no collation and return a list of data.
+    Returns:
+        DataLoader: a torch DataLoader, that loads the given detection
+        dataset, with test-time transformation and batching.
+    Examples:
+    ::
+        data_loader = build_detection_test_loader(
+            DatasetRegistry.get("my_test"),
+            mapper=DatasetMapper(...))
+        # or, instantiate with a CfgNode:
+        data_loader = build_detection_test_loader(cfg, "my_test")
+    """
+    if isinstance(dataset, list):
+        dataset = DatasetFromList(dataset, copy=False)
+    if mapper is not None:
+        dataset = MapDataset(dataset, mapper)
+    if isinstance(dataset, torchdata.IterableDataset):
+        assert sampler is None, "sampler must be None if dataset is IterableDataset"
+    else:
+        if sampler is None:
+            sampler = InferenceSampler(len(dataset))
+    return torchdata.DataLoader(
+        dataset,
+        batch_size=batch_size,
+        sampler=sampler,
+        drop_last=False,
+        num_workers=num_workers,
+        collate_fn=trivial_batch_collator if collate_fn is None else collate_fn,
+    )

oneformer/data/dataset_mappers/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+

oneformer/data/dataset_mappers/coco_unified_new_baseline_dataset_mapper.py ADDED Viewed

	@@ -0,0 +1,341 @@

+# ------------------------------------------------------------------------------
+# Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/dataset_mappers/coco_panoptic_new_baseline_dataset_mapper.py
+# Modified by Jitesh Jain (https://github.com/praeclarumjj3)
+# ------------------------------------------------------------------------------
+import copy
+import logging
+import numpy as np
+import torch
+from detectron2.data import MetadataCatalog
+from detectron2.config import configurable
+from detectron2.data import detection_utils as utils
+from detectron2.data import transforms as T
+from detectron2.structures import BitMasks, Instances
+from oneformer.utils.box_ops import masks_to_boxes
+from oneformer.data.tokenizer import SimpleTokenizer, Tokenize
+__all__ = ["COCOUnifiedNewBaselineDatasetMapper"]
+def build_transform_gen(cfg, is_train):
+    """
+    Create a list of default :class:`Augmentation` from config.
+    Now it includes resizing and flipping.
+    Returns:
+        list[Augmentation]
+    """
+    assert is_train, "Only support training augmentation"
+    image_size = cfg.INPUT.IMAGE_SIZE
+    min_scale = cfg.INPUT.MIN_SCALE
+    max_scale = cfg.INPUT.MAX_SCALE
+    augmentation = []
+    if cfg.INPUT.RANDOM_FLIP != "none":
+        augmentation.append(
+            T.RandomFlip(
+                horizontal=cfg.INPUT.RANDOM_FLIP == "horizontal",
+                vertical=cfg.INPUT.RANDOM_FLIP == "vertical",
+            )
+        )
+    augmentation.extend([
+        T.ResizeScale(
+            min_scale=min_scale, max_scale=max_scale, target_height=image_size, target_width=image_size
+        ),
+        T.FixedSizeCrop(crop_size=(image_size, image_size)),
+    ])
+    return augmentation
+# This is specifically designed for the COCO dataset.
+class COCOUnifiedNewBaselineDatasetMapper:
+    """
+    A callable which takes a dataset dict in Detectron2 Dataset format,
+    and map it into a format used by OneFormer.
+    This dataset mapper applies the same transformation as DETR for COCO panoptic segmentation.
+    The callable currently does the following:
+    1. Read the image from "file_name"
+    2. Applies geometric transforms to the image and annotation
+    3. Find and applies suitable cropping to the image and annotation
+    4. Prepare image and annotation to Tensors
+    """
+    @configurable
+    def __init__(
+        self,
+        is_train=True,
+        *,
+        num_queries,
+        tfm_gens,
+        meta,
+        image_format,
+        max_seq_len,
+        task_seq_len,
+        semantic_prob,
+        instance_prob,
+    ):
+        """
+        NOTE: this interface is experimental.
+        Args:
+            is_train: for training or inference
+            augmentations: a list of augmentations or deterministic transforms to apply
+            crop_gen: crop augmentation
+            tfm_gens: data augmentation
+            image_format: an image format supported by :func:`detection_utils.read_image`.
+        """
+        self.tfm_gens = tfm_gens
+        logging.getLogger(__name__).info(
+            "[COCOUnifiedNewBaselineDatasetMapper] Full TransformGens used in training: {}".format(
+                str(self.tfm_gens)
+            )
+        )
+        self.img_format = image_format
+        self.is_train = is_train
+        self.meta = meta
+        self.ignore_label = self.meta.ignore_label
+        self.num_queries = num_queries
+        self.things = []
+        for k,v in self.meta.thing_dataset_id_to_contiguous_id.items():
+            self.things.append(v)
+        self.class_names = self.meta.stuff_classes
+        self.text_tokenizer = Tokenize(SimpleTokenizer(), max_seq_len=max_seq_len)
+        self.task_tokenizer = Tokenize(SimpleTokenizer(), max_seq_len=task_seq_len)
+        self.semantic_prob = semantic_prob
+        self.instance_prob = instance_prob
+    @classmethod
+    def from_config(cls, cfg, is_train=True):
+        # Build augmentation
+        tfm_gens = build_transform_gen(cfg, is_train)
+        dataset_names = cfg.DATASETS.TRAIN
+        meta = MetadataCatalog.get(dataset_names[0])
+        ret = {
+            "is_train": is_train,
+            "meta": meta,
+            "tfm_gens": tfm_gens,
+            "image_format": cfg.INPUT.FORMAT,
+            "num_queries": cfg.MODEL.ONE_FORMER.NUM_OBJECT_QUERIES - cfg.MODEL.TEXT_ENCODER.N_CTX,
+            "task_seq_len": cfg.INPUT.TASK_SEQ_LEN,
+            "max_seq_len": cfg.INPUT.MAX_SEQ_LEN,
+            "semantic_prob": cfg.INPUT.TASK_PROB.SEMANTIC,
+            "instance_prob": cfg.INPUT.TASK_PROB.INSTANCE,
+        }
+        return ret
+    def _get_semantic_dict(self, pan_seg_gt, image_shape, segments_info, num_class_obj):
+        instances = Instances(image_shape)
+        classes = []
+        texts = ["a semantic photo"] * self.num_queries
+        masks = []
+        label = np.ones_like(pan_seg_gt) * self.ignore_label
+        for segment_info in segments_info:
+            class_id = segment_info["category_id"]
+            if not segment_info["iscrowd"]:
+                mask = pan_seg_gt == segment_info["id"]
+                if not np.all(mask == False):
+                    if class_id not in classes:
+                        cls_name = self.class_names[class_id]
+                        classes.append(class_id)
+                        masks.append(mask)
+                        num_class_obj[cls_name] += 1
+                    else:
+                        idx = classes.index(class_id)
+                        masks[idx] += mask
+                        masks[idx] = np.clip(masks[idx], 0, 1).astype(np.bool)
+                    label[mask] = class_id
+        num = 0
+        for i, cls_name in enumerate(self.class_names):
+            if num_class_obj[cls_name] > 0:
+                for _ in range(num_class_obj[cls_name]):
+                    if num >= len(texts):
+                        break
+                    texts[num] = f"a photo with a {cls_name}"
+                    num += 1
+        classes = np.array(classes)
+        instances.gt_classes = torch.tensor(classes, dtype=torch.int64)
+        if len(masks) == 0:
+            # Some image does not have annotation (all ignored)
+            instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1]))
+            instances.gt_bboxes = torch.zeros((0, 4))
+        else:
+            masks = BitMasks(
+                torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks])
+            )
+            instances.gt_masks = masks.tensor
+            # Placeholder bounding boxes for stuff regions. Note that these are not used during training.
+            instances.gt_bboxes = torch.stack([torch.tensor([0., 0., 1., 1.])] * instances.gt_masks.shape[0])
+        return instances, texts, label
+    def _get_instance_dict(self, pan_seg_gt, image_shape, segments_info, num_class_obj):
+        instances = Instances(image_shape)
+        classes = []
+        texts = ["an instance photo"] * self.num_queries
+        masks = []
+        label = np.ones_like(pan_seg_gt) * self.ignore_label
+        for segment_info in segments_info:
+            class_id = segment_info["category_id"]
+            if class_id in self.things:
+                if not segment_info["iscrowd"]:
+                    mask = pan_seg_gt == segment_info["id"]
+                    if not np.all(mask == False):
+                        cls_name = self.class_names[class_id]
+                        classes.append(class_id)
+                        masks.append(mask)
+                        num_class_obj[cls_name] += 1
+                        label[mask] = class_id
+        num = 0
+        for i, cls_name in enumerate(self.class_names):
+            if num_class_obj[cls_name] > 0:
+                for _ in range(num_class_obj[cls_name]):
+                    if num >= len(texts):
+                        break
+                    texts[num] = f"a photo with a {cls_name}"
+                    num += 1
+        classes = np.array(classes)
+        instances.gt_classes = torch.tensor(classes, dtype=torch.int64)
+        if len(masks) == 0:
+            # Some image does not have annotation (all ignored)
+            instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1]))
+            instances.gt_bboxes = torch.zeros((0, 4))
+        else:
+            masks = BitMasks(
+                torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks])
+            )
+            instances.gt_masks = masks.tensor
+            instances.gt_bboxes = masks_to_boxes(instances.gt_masks)
+        return instances, texts, label
+    def _get_panoptic_dict(self, pan_seg_gt, image_shape, segments_info, num_class_obj):
+        instances = Instances(image_shape)
+        classes = []
+        texts = ["a panoptic photo"] * self.num_queries
+        masks = []
+        label = np.ones_like(pan_seg_gt) * self.ignore_label
+        for segment_info in segments_info:
+            class_id = segment_info["category_id"]
+            if not segment_info["iscrowd"]:
+                mask = pan_seg_gt == segment_info["id"]
+                if not np.all(mask == False):
+                    cls_name = self.class_names[class_id]
+                    classes.append(class_id)
+                    masks.append(mask)
+                    num_class_obj[cls_name] += 1
+                    label[mask] = class_id
+        num = 0
+        for i, cls_name in enumerate(self.class_names):
+            if num_class_obj[cls_name] > 0:
+                for _ in range(num_class_obj[cls_name]):
+                    if num >= len(texts):
+                        break
+                    texts[num] = f"a photo with a {cls_name}"
+                    num += 1
+        classes = np.array(classes)
+        instances.gt_classes = torch.tensor(classes, dtype=torch.int64)
+        if len(masks) == 0:
+            # Some image does not have annotation (all ignored)
+            instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1]))
+            instances.gt_bboxes = torch.zeros((0, 4))
+        else:
+            masks = BitMasks(
+                torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks])
+            )
+            instances.gt_masks = masks.tensor
+            instances.gt_bboxes = masks_to_boxes(instances.gt_masks)
+            for i in range(instances.gt_classes.shape[0]):
+                # Placeholder bounding boxes for stuff regions. Note that these are not used during training.
+                if instances.gt_classes[i].item() not in self.things:
+                    instances.gt_bboxes[i] = torch.tensor([0., 0., 1., 1.])
+        return instances, texts, label
+    def __call__(self, dataset_dict):
+        """
+        Args:
+            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
+        Returns:
+            dict: a format that builtin models in detectron2 accept
+        """
+        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
+        image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
+        utils.check_image_size(dataset_dict, image)
+        image, transforms = T.apply_transform_gens(self.tfm_gens, image)
+        image_shape = image.shape[:2]  # h, w
+        # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
+        # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
+        # Therefore it's important to use torch.Tensor.
+        dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
+        if not self.is_train:
+            # USER: Modify this if you want to keep them for some reason.
+            dataset_dict.pop("annotations", None)
+            return dataset_dict
+        # semantic segmentation
+        if "sem_seg_file_name" in dataset_dict:
+            # PyTorch transformation not implemented for uint16, so converting it to double first
+            sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double")
+            sem_seg_gt = transforms.apply_segmentation(sem_seg_gt)
+        else:
+            sem_seg_gt = None
+        if "pan_seg_file_name" in dataset_dict:
+            pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB")
+            segments_info = dataset_dict["segments_info"]
+            # apply the same transformation to panoptic segmentation
+            pan_seg_gt = transforms.apply_segmentation(pan_seg_gt)
+            from panopticapi.utils import rgb2id
+            pan_seg_gt = rgb2id(pan_seg_gt)
+        prob_task = np.random.uniform(0,1.)
+        num_class_obj = {}
+        for name in self.class_names:
+            num_class_obj[name] = 0
+        if prob_task < self.semantic_prob:
+            task = "The task is semantic"
+            instances, text, sem_seg = self._get_semantic_dict(pan_seg_gt, image_shape, segments_info, num_class_obj)
+        elif prob_task < self.instance_prob:
+            task = "The task is instance"
+            instances, text, sem_seg = self._get_instance_dict(pan_seg_gt, image_shape, segments_info, num_class_obj)
+        else:
+            task = "The task is panoptic"
+            instances, text, sem_seg = self._get_panoptic_dict(pan_seg_gt, image_shape, segments_info, num_class_obj)
+        dataset_dict["sem_seg"] = torch.from_numpy(sem_seg).long()
+        dataset_dict["instances"] = instances
+        dataset_dict["orig_shape"] = image_shape
+        dataset_dict["task"] = task
+        dataset_dict["text"] = text
+        dataset_dict["thing_ids"] = self.things
+        return dataset_dict

oneformer/data/dataset_mappers/dataset_mapper.py ADDED Viewed

	@@ -0,0 +1,203 @@

+# ------------------------------------------------------------------------------
+# Reference: https://github.com/facebookresearch/detectron2/blob/main/detectron2/data/dataset_mapper.py
+# Modified by Jitesh Jain (https://github.com/praeclarumjj3)
+# ------------------------------------------------------------------------------
+import copy
+import logging
+import numpy as np
+from typing import List, Optional, Union
+import torch
+from detectron2.config import configurable
+from detectron2.data import detection_utils as utils
+from detectron2.data import transforms as T
+from oneformer.data.tokenizer import SimpleTokenizer, Tokenize
+__all__ = ["DatasetMapper"]
+class DatasetMapper:
+    """
+    A callable which takes a dataset dict in Detectron2 Dataset format,
+    and map it into a format used by the model.
+    This is the default callable to be used to map your dataset dict into training data.
+    You may need to follow it to implement your own one for customized logic,
+    such as a different way to read or transform images.
+    See :doc:`/tutorials/data_loading` for details.
+    The callable currently does the following:
+    1. Read the image from "file_name"
+    2. Applies cropping/geometric transforms to the image and annotations
+    3. Prepare data and annotations to Tensor and :class:`Instances`
+    """
+    @configurable
+    def __init__(
+        self,
+        is_train: bool,
+        *,
+        augmentations: List[Union[T.Augmentation, T.Transform]],
+        image_format: str,
+        task_seq_len: int,
+        task: str = "panoptic",
+        use_instance_mask: bool = False,
+        use_keypoint: bool = False,
+        instance_mask_format: str = "polygon",
+        keypoint_hflip_indices: Optional[np.ndarray] = None,
+        precomputed_proposal_topk: Optional[int] = None,
+        recompute_boxes: bool = False,
+    ):
+        """
+        NOTE: this interface is experimental.
+        Args:
+            is_train: whether it's used in training or inference
+            augmentations: a list of augmentations or deterministic transforms to apply
+            image_format: an image format supported by :func:`detection_utils.read_image`.
+            use_instance_mask: whether to process instance segmentation annotations, if available
+            use_keypoint: whether to process keypoint annotations if available
+            instance_mask_format: one of "polygon" or "bitmask". Process instance segmentation
+                masks into this format.
+            keypoint_hflip_indices: see :func:`detection_utils.create_keypoint_hflip_indices`
+            precomputed_proposal_topk: if given, will load pre-computed
+                proposals from dataset_dict and keep the top k proposals for each image.
+            recompute_boxes: whether to overwrite bounding box annotations
+                by computing tight bounding boxes from instance mask annotations.
+        """
+        if recompute_boxes:
+            assert use_instance_mask, "recompute_boxes requires instance masks"
+        # fmt: off
+        self.is_train               = is_train
+        self.augmentations          = T.AugmentationList(augmentations)
+        self.image_format           = image_format
+        self.use_instance_mask      = use_instance_mask
+        self.instance_mask_format   = instance_mask_format
+        self.use_keypoint           = use_keypoint
+        self.keypoint_hflip_indices = keypoint_hflip_indices
+        self.proposal_topk          = precomputed_proposal_topk
+        self.recompute_boxes        = recompute_boxes
+        self.task_tokenizer = Tokenize(SimpleTokenizer(), max_seq_len=task_seq_len)
+        self.task = task
+        assert self.task in ["panoptic", "semantic", "instance"]
+        # fmt: on
+        logger = logging.getLogger(__name__)
+        mode = "training" if is_train else "inference"
+        logger.info(f"[DatasetMapper] Augmentations used in {mode}: {augmentations}")
+    @classmethod
+    def from_config(cls, cfg, is_train: bool = True):
+        augs = utils.build_augmentation(cfg, is_train)
+        if cfg.INPUT.CROP.ENABLED and is_train:
+            augs.insert(0, T.RandomCrop(cfg.INPUT.CROP.TYPE, cfg.INPUT.CROP.SIZE))
+            recompute_boxes = cfg.MODEL.MASK_ON
+        else:
+            recompute_boxes = False
+        ret = {
+            "is_train": is_train,
+            "augmentations": augs,
+            "image_format": cfg.INPUT.FORMAT,
+            "use_instance_mask": cfg.MODEL.MASK_ON,
+            "instance_mask_format": cfg.INPUT.MASK_FORMAT,
+            "use_keypoint": cfg.MODEL.KEYPOINT_ON,
+            "task_seq_len": cfg.INPUT.TASK_SEQ_LEN,
+            "recompute_boxes": recompute_boxes,
+            "task": cfg.MODEL.TEST.TASK,
+        }
+        if cfg.MODEL.KEYPOINT_ON:
+            ret["keypoint_hflip_indices"] = utils.create_keypoint_hflip_indices(cfg.DATASETS.TRAIN)
+        if cfg.MODEL.LOAD_PROPOSALS:
+            ret["precomputed_proposal_topk"] = (
+                cfg.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TRAIN
+                if is_train
+                else cfg.DATASETS.PRECOMPUTED_PROPOSAL_TOPK_TEST
+            )
+        return ret
+    def _transform_annotations(self, dataset_dict, transforms, image_shape):
+        # USER: Modify this if you want to keep them for some reason.
+        for anno in dataset_dict["annotations"]:
+            if not self.use_instance_mask:
+                anno.pop("segmentation", None)
+            if not self.use_keypoint:
+                anno.pop("keypoints", None)
+        # USER: Implement additional transformations if you have other types of data
+        annos = [
+            utils.transform_instance_annotations(
+                obj, transforms, image_shape, keypoint_hflip_indices=self.keypoint_hflip_indices
+            )
+            for obj in dataset_dict.pop("annotations")
+            if obj.get("iscrowd", 0) == 0
+        ]
+        instances = utils.annotations_to_instances(
+            annos, image_shape, mask_format=self.instance_mask_format
+        )
+        # After transforms such as cropping are applied, the bounding box may no longer
+        # tightly bound the object. As an example, imagine a triangle object
+        # [(0,0), (2,0), (0,2)] cropped by a box [(1,0),(2,2)] (XYXY format). The tight
+        # bounding box of the cropped triangle should be [(1,0),(2,1)], which is not equal to
+        # the intersection of original bounding box and the cropping box.
+        if self.recompute_boxes:
+            instances.gt_boxes = instances.gt_masks.get_bounding_boxes()
+        dataset_dict["instances"] = utils.filter_empty_instances(instances)
+    def __call__(self, dataset_dict):
+        """
+        Args:
+            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
+        Returns:
+            dict: a format that builtin models in detectron2 accept
+        """
+        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
+        # USER: Write your own image loading if it's not from a file
+        image = utils.read_image(dataset_dict["file_name"], format=self.image_format)
+        utils.check_image_size(dataset_dict, image)
+        task = f"The task is {self.task}"
+        dataset_dict["task"] = task
+        # USER: Remove if you don't do semantic/panoptic segmentation.
+        if "sem_seg_file_name" in dataset_dict:
+            sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name"), "L").squeeze(2)
+        else:
+            sem_seg_gt = None
+        aug_input = T.AugInput(image, sem_seg=sem_seg_gt)
+        transforms = self.augmentations(aug_input)
+        image, sem_seg_gt = aug_input.image, aug_input.sem_seg
+        image_shape = image.shape[:2]  # h, w
+        # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
+        # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
+        # Therefore it's important to use torch.Tensor.
+        dataset_dict["image"] = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
+        if sem_seg_gt is not None:
+            dataset_dict["sem_seg"] = torch.as_tensor(sem_seg_gt.astype("long"))
+        # USER: Remove if you don't use pre-computed proposals.
+        # Most users would not need this feature.
+        if self.proposal_topk is not None:
+            utils.transform_proposals(
+                dataset_dict, image_shape, transforms, proposal_topk=self.proposal_topk
+            )
+        if not self.is_train:
+            # USER: Modify this if you want to keep them for some reason.
+            dataset_dict.pop("annotations", None)
+            dataset_dict.pop("sem_seg_file_name", None)
+            return dataset_dict
+        if "annotations" in dataset_dict:
+            self._transform_annotations(dataset_dict, transforms, image_shape)
+        return dataset_dict

oneformer/data/dataset_mappers/oneformer_unified_dataset_mapper.py ADDED Viewed

	@@ -0,0 +1,375 @@

+# ------------------------------------------------------------------------------
+# Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/dataset_mappers/mask_former_panoptic_dataset_mapper.py
+# Modified by Jitesh Jain (https://github.com/praeclarumjj3)
+# ------------------------------------------------------------------------------
+import copy
+import logging
+import os
+import numpy as np
+import torch
+from torch.nn import functional as F
+from detectron2.config import configurable
+from detectron2.data import detection_utils as utils
+from detectron2.data import transforms as T
+from detectron2.structures import BitMasks, Instances
+from detectron2.data import MetadataCatalog
+from detectron2.projects.point_rend import ColorAugSSDTransform
+from oneformer.utils.box_ops import masks_to_boxes
+from oneformer.data.tokenizer import SimpleTokenizer, Tokenize
+__all__ = ["OneFormerUnifiedDatasetMapper"]
+class OneFormerUnifiedDatasetMapper:
+    """
+    A callable which takes a dataset dict in Detectron2 Dataset format,
+    and map it into a format used by OneFormer for universal segmentation.
+    The callable currently does the following:
+    1. Read the image from "file_name"
+    2. Applies geometric transforms to the image and annotation
+    3. Find and applies suitable cropping to the image and annotation
+    4. Prepare image and annotation to Tensors
+    """
+    @configurable
+    def __init__(
+        self,
+        is_train=True,
+        *,
+        name,
+        num_queries,
+        meta,
+        augmentations,
+        image_format,
+        ignore_label,
+        size_divisibility,
+        task_seq_len,
+        max_seq_len,
+        semantic_prob,
+        instance_prob,
+    ):
+        """
+        NOTE: this interface is experimental.
+        Args:
+            is_train: for training or inference
+            augmentations: a list of augmentations or deterministic transforms to apply
+            image_format: an image format supported by :func:`detection_utils.read_image`.
+            ignore_label: the label that is ignored to evaluation
+            size_divisibility: pad image size to be divisible by this value
+        """
+        self.is_train = is_train
+        self.meta = meta
+        self.name = name
+        self.tfm_gens = augmentations
+        self.img_format = image_format
+        self.ignore_label = ignore_label
+        self.size_divisibility = size_divisibility
+        self.num_queries = num_queries
+        logger = logging.getLogger(__name__)
+        mode = "training" if is_train else "inference"
+        logger.info(f"[{self.__class__.__name__}] Augmentations used in {mode}: {augmentations}")
+        self.things = []
+        for k,v in self.meta.thing_dataset_id_to_contiguous_id.items():
+            self.things.append(v)
+        self.class_names = self.meta.stuff_classes
+        self.text_tokenizer = Tokenize(SimpleTokenizer(), max_seq_len=max_seq_len)
+        self.task_tokenizer = Tokenize(SimpleTokenizer(), max_seq_len=task_seq_len)
+        self.semantic_prob = semantic_prob
+        self.instance_prob = instance_prob
+    @classmethod
+    def from_config(cls, cfg, is_train=True):
+        # Build augmentation
+        augs = [
+            T.ResizeShortestEdge(
+                cfg.INPUT.MIN_SIZE_TRAIN,
+                cfg.INPUT.MAX_SIZE_TRAIN,
+                cfg.INPUT.MIN_SIZE_TRAIN_SAMPLING,
+            )
+        ]
+        if cfg.INPUT.CROP.ENABLED:
+            augs.append(
+                T.RandomCrop_CategoryAreaConstraint(
+                    cfg.INPUT.CROP.TYPE,
+                    cfg.INPUT.CROP.SIZE,
+                    cfg.INPUT.CROP.SINGLE_CATEGORY_MAX_AREA,
+                    cfg.MODEL.SEM_SEG_HEAD.IGNORE_VALUE,
+                )
+            )
+        if cfg.INPUT.COLOR_AUG_SSD:
+            augs.append(ColorAugSSDTransform(img_format=cfg.INPUT.FORMAT))
+        augs.append(T.RandomFlip())
+        # Assume always applies to the training set.
+        dataset_names = cfg.DATASETS.TRAIN
+        meta = MetadataCatalog.get(dataset_names[0])
+        ignore_label = meta.ignore_label
+        ret = {
+            "is_train": is_train,
+            "meta": meta,
+            "name": dataset_names[0],
+            "num_queries": cfg.MODEL.ONE_FORMER.NUM_OBJECT_QUERIES - cfg.MODEL.TEXT_ENCODER.N_CTX,
+            "task_seq_len": cfg.INPUT.TASK_SEQ_LEN,
+            "max_seq_len": cfg.INPUT.MAX_SEQ_LEN,
+            "augmentations": augs,
+            "image_format": cfg.INPUT.FORMAT,
+            "ignore_label": ignore_label,
+            "size_divisibility": cfg.INPUT.SIZE_DIVISIBILITY,
+            "semantic_prob": cfg.INPUT.TASK_PROB.SEMANTIC,
+            "instance_prob": cfg.INPUT.TASK_PROB.INSTANCE,
+        }
+        return ret
+    def _get_semantic_dict(self, pan_seg_gt, image_shape, segments_info, num_class_obj):
+        pan_seg_gt = pan_seg_gt.numpy()
+        instances = Instances(image_shape)
+        classes = []
+        texts = ["a semantic photo"] * self.num_queries
+        masks = []
+        label = np.ones_like(pan_seg_gt) * self.ignore_label
+        for segment_info in segments_info:
+            class_id = segment_info["category_id"]
+            if not segment_info["iscrowd"]:
+                mask = pan_seg_gt == segment_info["id"]
+                if not np.all(mask == False):
+                    if class_id not in classes:
+                        cls_name = self.class_names[class_id]
+                        classes.append(class_id)
+                        masks.append(mask)
+                        num_class_obj[cls_name] += 1
+                    else:
+                        idx = classes.index(class_id)
+                        masks[idx] += mask
+                        masks[idx] = np.clip(masks[idx], 0, 1).astype(np.bool)
+                    label[mask] = class_id
+        num = 0
+        for i, cls_name in enumerate(self.class_names):
+            if num_class_obj[cls_name] > 0:
+                for _ in range(num_class_obj[cls_name]):
+                    if num >= len(texts):
+                        break
+                    texts[num] = f"a photo with a {cls_name}"
+                    num += 1
+        classes = np.array(classes)
+        instances.gt_classes = torch.tensor(classes, dtype=torch.int64)
+        if len(masks) == 0:
+            # Some image does not have annotation (all ignored)
+            instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1]))
+            instances.gt_bboxes = torch.zeros((0, 4))
+        else:
+            masks = BitMasks(
+                torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks])
+            )
+            instances.gt_masks = masks.tensor
+            # Placeholder bounding boxes for stuff regions. Note that these are not used during training.
+            instances.gt_bboxes = torch.stack([torch.tensor([0., 0., 1., 1.])] * instances.gt_masks.shape[0])
+        return instances, texts, label
+    def _get_instance_dict(self, pan_seg_gt, image_shape, segments_info, num_class_obj):
+        pan_seg_gt = pan_seg_gt.numpy()
+        instances = Instances(image_shape)
+        classes = []
+        texts = ["an instance photo"] * self.num_queries
+        masks = []
+        label = np.ones_like(pan_seg_gt) * self.ignore_label
+        for segment_info in segments_info:
+            class_id = segment_info["category_id"]
+            if class_id in self.things:
+                if not segment_info["iscrowd"]:
+                    mask = pan_seg_gt == segment_info["id"]
+                    if not np.all(mask == False):
+                        cls_name = self.class_names[class_id]
+                        classes.append(class_id)
+                        masks.append(mask)
+                        num_class_obj[cls_name] += 1
+                        label[mask] = class_id
+        num = 0
+        for i, cls_name in enumerate(self.class_names):
+            if num_class_obj[cls_name] > 0:
+                for _ in range(num_class_obj[cls_name]):
+                    if num >= len(texts):
+                        break
+                    texts[num] = f"a photo with a {cls_name}"
+                    num += 1
+        classes = np.array(classes)
+        instances.gt_classes = torch.tensor(classes, dtype=torch.int64)
+        if len(masks) == 0:
+            # Some image does not have annotation (all ignored)
+            instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1]))
+            instances.gt_bboxes = torch.zeros((0, 4))
+        else:
+            masks = BitMasks(
+                torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks])
+            )
+            instances.gt_masks = masks.tensor
+            instances.gt_bboxes = masks_to_boxes(instances.gt_masks)
+        return instances, texts, label
+    def _get_panoptic_dict(self, pan_seg_gt, image_shape, segments_info, num_class_obj):
+        pan_seg_gt = pan_seg_gt.numpy()
+        instances = Instances(image_shape)
+        classes = []
+        texts = ["a panoptic photo"] * self.num_queries
+        masks = []
+        label = np.ones_like(pan_seg_gt) * self.ignore_label
+        for segment_info in segments_info:
+            class_id = segment_info["category_id"]
+            if not segment_info["iscrowd"]:
+                mask = pan_seg_gt == segment_info["id"]
+                if not np.all(mask == False):
+                    cls_name = self.class_names[class_id]
+                    classes.append(class_id)
+                    masks.append(mask)
+                    num_class_obj[cls_name] += 1
+                    label[mask] = class_id
+        num = 0
+        for i, cls_name in enumerate(self.class_names):
+            if num_class_obj[cls_name] > 0:
+                for _ in range(num_class_obj[cls_name]):
+                    if num >= len(texts):
+                        break
+                    texts[num] = f"a photo with a {cls_name}"
+                    num += 1
+        classes = np.array(classes)
+        instances.gt_classes = torch.tensor(classes, dtype=torch.int64)
+        if len(masks) == 0:
+            # Some image does not have annotation (all ignored)
+            instances.gt_masks = torch.zeros((0, pan_seg_gt.shape[-2], pan_seg_gt.shape[-1]))
+            instances.gt_bboxes = torch.zeros((0, 4))
+        else:
+            masks = BitMasks(
+                torch.stack([torch.from_numpy(np.ascontiguousarray(x.copy())) for x in masks])
+            )
+            instances.gt_masks = masks.tensor
+            instances.gt_bboxes = masks_to_boxes(instances.gt_masks)
+            for i in range(instances.gt_classes.shape[0]):
+                # Placeholder bounding boxes for stuff regions. Note that these are not used during training.
+                if instances.gt_classes[i].item() not in self.things:
+                    instances.gt_bboxes[i] = torch.tensor([0., 0., 1., 1.])
+        return instances, texts, label
+    def __call__(self, dataset_dict):
+        """
+        Args:
+            dataset_dict (dict): Metadata of one image, in Detectron2 Dataset format.
+        Returns:
+            dict: a format that builtin models in detectron2 accept
+        """
+        assert self.is_train, "OneFormerUnifiedDatasetMapper should only be used for training!"
+        dataset_dict = copy.deepcopy(dataset_dict)  # it will be modified by code below
+        image = utils.read_image(dataset_dict["file_name"], format=self.img_format)
+        utils.check_image_size(dataset_dict, image)
+        # semantic segmentation
+        if "sem_seg_file_name" in dataset_dict:
+            # PyTorch transformation not implemented for uint16, so converting it to double first
+            sem_seg_gt = utils.read_image(dataset_dict.pop("sem_seg_file_name")).astype("double")
+        else:
+            sem_seg_gt = None
+        # panoptic segmentation
+        if "pan_seg_file_name" in dataset_dict:
+            pan_seg_gt = utils.read_image(dataset_dict.pop("pan_seg_file_name"), "RGB")
+            segments_info = dataset_dict["segments_info"]
+        else:
+            pan_seg_gt = None
+            segments_info = None
+        if pan_seg_gt is None:
+            raise ValueError(
+                "Cannot find 'pan_seg_file_name' for panoptic segmentation dataset {}.".format(
+                    dataset_dict["file_name"]
+                )
+            )
+        aug_input = T.AugInput(image, sem_seg=sem_seg_gt)
+        aug_input, transforms = T.apply_transform_gens(self.tfm_gens, aug_input)
+        image = aug_input.image
+        if sem_seg_gt is not None:
+            sem_seg_gt = aug_input.sem_seg
+        # apply the same transformation to panoptic segmentation
+        pan_seg_gt = transforms.apply_segmentation(pan_seg_gt)
+        from panopticapi.utils import rgb2id
+        pan_seg_gt = rgb2id(pan_seg_gt)
+        # Pad image and segmentation label here!
+        image = torch.as_tensor(np.ascontiguousarray(image.transpose(2, 0, 1)))
+        if sem_seg_gt is not None:
+            sem_seg_gt = torch.as_tensor(sem_seg_gt.astype("long"))
+        pan_seg_gt = torch.as_tensor(pan_seg_gt.astype("long"))
+        if self.size_divisibility > 0:
+            image_size = (image.shape[-2], image.shape[-1])
+            padding_size = [
+                0,
+                self.size_divisibility - image_size[1],
+                0,
+                self.size_divisibility - image_size[0],
+            ]
+            image = F.pad(image, padding_size, value=128).contiguous()
+            if sem_seg_gt is not None:
+                sem_seg_gt = F.pad(sem_seg_gt, padding_size, value=self.ignore_label).contiguous()
+            pan_seg_gt = F.pad(
+                pan_seg_gt, padding_size, value=0
+            ).contiguous()  # 0 is the VOID panoptic label
+        image_shape = (image.shape[-2], image.shape[-1])  # h, w
+        # Pytorch's dataloader is efficient on torch.Tensor due to shared-memory,
+        # but not efficient on large generic data structures due to the use of pickle & mp.Queue.
+        # Therefore it's important to use torch.Tensor.
+        dataset_dict["image"] = image
+        if "annotations" in dataset_dict:
+            raise ValueError("Pemantic segmentation dataset should not have 'annotations'.")
+        prob_task = np.random.uniform(0,1.)
+        num_class_obj = {}
+        for name in self.class_names:
+            num_class_obj[name] = 0
+        if prob_task < self.semantic_prob:
+            task = "The task is semantic"
+            instances, text, sem_seg = self._get_semantic_dict(pan_seg_gt, image_shape, segments_info, num_class_obj)
+        elif prob_task < self.instance_prob:
+            task = "The task is instance"
+            instances, text, sem_seg = self._get_instance_dict(pan_seg_gt, image_shape, segments_info, num_class_obj)
+        else:
+            task = "The task is panoptic"
+            instances, text, sem_seg = self._get_panoptic_dict(pan_seg_gt, image_shape, segments_info, num_class_obj)
+        dataset_dict["sem_seg"] = torch.from_numpy(sem_seg).long()
+        dataset_dict["instances"] = instances
+        dataset_dict["orig_shape"] = image_shape
+        dataset_dict["task"] = task
+        dataset_dict["text"] = text
+        dataset_dict["thing_ids"] = self.things
+        return dataset_dict

oneformer/data/datasets/__init__.py ADDED Viewed

	@@ -0,0 +1,7 @@

+from . import (
+    register_ade20k_panoptic,
+    register_cityscapes_panoptic,
+    register_coco_panoptic_annos_semseg,
+    register_ade20k_instance,
+    register_coco_panoptic2instance,
+)

oneformer/data/datasets/register_ade20k_instance.py ADDED Viewed

	@@ -0,0 +1,56 @@

+# ------------------------------------------------------------------------------
+# Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/datasets/register_ade20k_instance.py
+# ------------------------------------------------------------------------------
+import json
+import logging
+import numpy as np
+import os
+from PIL import Image
+from detectron2.data import DatasetCatalog, MetadataCatalog
+from detectron2.data.datasets.coco import load_coco_json, register_coco_instances
+from detectron2.utils.file_io import PathManager
+ADE_CATEGORIES = [{'id': 7, 'name': 'bed'}, {'id': 8, 'name': 'windowpane'}, {'id': 10, 'name': 'cabinet'}, {'id': 12, 'name': 'person'}, {'id': 14, 'name': 'door'}, {'id': 15, 'name': 'table'}, {'id': 18, 'name': 'curtain'}, {'id': 19, 'name': 'chair'}, {'id': 20, 'name': 'car'}, {'id': 22, 'name': 'painting'}, {'id': 23, 'name': 'sofa'}, {'id': 24, 'name': 'shelf'}, {'id': 27, 'name': 'mirror'}, {'id': 30, 'name': 'armchair'}, {'id': 31, 'name': 'seat'}, {'id': 32, 'name': 'fence'}, {'id': 33, 'name': 'desk'}, {'id': 35, 'name': 'wardrobe'}, {'id': 36, 'name': 'lamp'}, {'id': 37, 'name': 'bathtub'}, {'id': 38, 'name': 'railing'}, {'id': 39, 'name': 'cushion'}, {'id': 41, 'name': 'box'}, {'id': 42, 'name': 'column'}, {'id': 43, 'name': 'signboard'}, {'id': 44, 'name': 'chest of drawers'}, {'id': 45, 'name': 'counter'}, {'id': 47, 'name': 'sink'}, {'id': 49, 'name': 'fireplace'}, {'id': 50, 'name': 'refrigerator'}, {'id': 53, 'name': 'stairs'}, {'id': 55, 'name': 'case'}, {'id': 56, 'name': 'pool table'}, {'id': 57, 'name': 'pillow'}, {'id': 58, 'name': 'screen door'}, {'id': 62, 'name': 'bookcase'}, {'id': 64, 'name': 'coffee table'}, {'id': 65, 'name': 'toilet'}, {'id': 66, 'name': 'flower'}, {'id': 67, 'name': 'book'}, {'id': 69, 'name': 'bench'}, {'id': 70, 'name': 'countertop'}, {'id': 71, 'name': 'stove'}, {'id': 72, 'name': 'palm'}, {'id': 73, 'name': 'kitchen island'}, {'id': 74, 'name': 'computer'}, {'id': 75, 'name': 'swivel chair'}, {'id': 76, 'name': 'boat'}, {'id': 78, 'name': 'arcade machine'}, {'id': 80, 'name': 'bus'}, {'id': 81, 'name': 'towel'}, {'id': 82, 'name': 'light'}, {'id': 83, 'name': 'truck'}, {'id': 85, 'name': 'chandelier'}, {'id': 86, 'name': 'awning'}, {'id': 87, 'name': 'streetlight'}, {'id': 88, 'name': 'booth'}, {'id': 89, 'name': 'television receiver'}, {'id': 90, 'name': 'airplane'}, {'id': 92, 'name': 'apparel'}, {'id': 93, 'name': 'pole'}, {'id': 95, 'name': 'bannister'}, {'id': 97, 'name': 'ottoman'}, {'id': 98, 'name': 
'bottle'}, {'id': 102, 'name': 'van'}, {'id': 103, 'name': 'ship'}, {'id': 104, 'name': 'fountain'}, {'id': 107, 'name': 'washer'}, {'id': 108, 'name': 'plaything'}, {'id': 110, 'name': 'stool'}, {'id': 111, 'name': 'barrel'}, {'id': 112, 'name': 'basket'}, {'id': 115, 'name': 'bag'}, {'id': 116, 'name': 'minibike'}, {'id': 118, 'name': 'oven'}, {'id': 119, 'name': 'ball'}, {'id': 120, 'name': 'food'}, {'id': 121, 'name': 'step'}, {'id': 123, 'name': 'trade name'}, {'id': 124, 'name': 'microwave'}, {'id': 125, 'name': 'pot'}, {'id': 126, 'name': 'animal'}, {'id': 127, 'name': 'bicycle'}, {'id': 129, 'name': 'dishwasher'}, {'id': 130, 'name': 'screen'}, {'id': 132, 'name': 'sculpture'}, {'id': 133, 'name': 'hood'}, {'id': 134, 'name': 'sconce'}, {'id': 135, 'name': 'vase'}, {'id': 136, 'name': 'traffic light'}, {'id': 137, 'name': 'tray'}, {'id': 138, 'name': 'ashcan'}, {'id': 139, 'name': 'fan'}, {'id': 142, 'name': 'plate'}, {'id': 143, 'name': 'monitor'}, {'id': 144, 'name': 'bulletin board'}, {'id': 146, 'name': 'radiator'}, {'id': 147, 'name': 'glass'}, {'id': 148, 'name': 'clock'}, {'id': 149, 'name': 'flag'}]
+_PREDEFINED_SPLITS = {
+    # point annotations without masks
+    "ade20k_instance_train": (
+        "ADEChallengeData2016/images/training",
+        "ADEChallengeData2016/ade20k_instance_train.json",
+    ),
+    "ade20k_instance_val": (
+        "ADEChallengeData2016/images/validation",
+        "ADEChallengeData2016/ade20k_instance_val.json",
+    ),
+}
+def _get_ade_instances_meta():
+    thing_ids = [k["id"] for k in ADE_CATEGORIES]
+    assert len(thing_ids) == 100, len(thing_ids)
+    # Mapping from the incontiguous ADE category id to an id in [0, 99]
+    thing_dataset_id_to_contiguous_id = {k: i for i, k in enumerate(thing_ids)}
+    thing_classes = [k["name"] for k in ADE_CATEGORIES]
+    ret = {
+        "thing_dataset_id_to_contiguous_id": thing_dataset_id_to_contiguous_id,
+        "thing_classes": thing_classes,
+    }
+    return ret
+def register_all_ade20k_instance(root):
+    for key, (image_root, json_file) in _PREDEFINED_SPLITS.items():
+        # Assume pre-defined datasets live in `./datasets`.
+        register_coco_instances(
+            key,
+            _get_ade_instances_meta(),
+            os.path.join(root, json_file) if "://" not in json_file else json_file,
+            os.path.join(root, image_root),
+        )
+_root = os.getenv("DETECTRON2_DATASETS", "datasets")
+register_all_ade20k_instance(_root)

oneformer/data/datasets/register_ade20k_panoptic.py ADDED Viewed

	@@ -0,0 +1,394 @@

+# ------------------------------------------------------------------------------
+# Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/datasets/register_ade20k_panoptic.py
+# Modified by Jitesh Jain (https://github.com/praeclarumjj3)
+# ------------------------------------------------------------------------------
+import json
+import os
+from detectron2.data import DatasetCatalog, MetadataCatalog
+from detectron2.utils.file_io import PathManager
+ADE20K_150_CATEGORIES = [
+    {"color": [120, 120, 120], "id": 0, "isthing": 0, "name": "wall"},
+    {"color": [180, 120, 120], "id": 1, "isthing": 0, "name": "building"},
+    {"color": [6, 230, 230], "id": 2, "isthing": 0, "name": "sky"},
+    {"color": [80, 50, 50], "id": 3, "isthing": 0, "name": "floor"},
+    {"color": [4, 200, 3], "id": 4, "isthing": 0, "name": "tree"},
+    {"color": [120, 120, 80], "id": 5, "isthing": 0, "name": "ceiling"},
+    {"color": [140, 140, 140], "id": 6, "isthing": 0, "name": "road, route"},
+    {"color": [204, 5, 255], "id": 7, "isthing": 1, "name": "bed"},
+    {"color": [230, 230, 230], "id": 8, "isthing": 1, "name": "window "},
+    {"color": [4, 250, 7], "id": 9, "isthing": 0, "name": "grass"},
+    {"color": [224, 5, 255], "id": 10, "isthing": 1, "name": "cabinet"},
+    {"color": [235, 255, 7], "id": 11, "isthing": 0, "name": "sidewalk, pavement"},
+    {"color": [150, 5, 61], "id": 12, "isthing": 1, "name": "person"},
+    {"color": [120, 120, 70], "id": 13, "isthing": 0, "name": "earth, ground"},
+    {"color": [8, 255, 51], "id": 14, "isthing": 1, "name": "door"},
+    {"color": [255, 6, 82], "id": 15, "isthing": 1, "name": "table"},
+    {"color": [143, 255, 140], "id": 16, "isthing": 0, "name": "mountain, mount"},
+    {"color": [204, 255, 4], "id": 17, "isthing": 0, "name": "plant"},
+    {"color": [255, 51, 7], "id": 18, "isthing": 1, "name": "curtain"},
+    {"color": [204, 70, 3], "id": 19, "isthing": 1, "name": "chair"},
+    {"color": [0, 102, 200], "id": 20, "isthing": 1, "name": "car"},
+    {"color": [61, 230, 250], "id": 21, "isthing": 0, "name": "water"},
+    {"color": [255, 6, 51], "id": 22, "isthing": 1, "name": "painting, picture"},
+    {"color": [11, 102, 255], "id": 23, "isthing": 1, "name": "sofa"},
+    {"color": [255, 7, 71], "id": 24, "isthing": 1, "name": "shelf"},
+    {"color": [255, 9, 224], "id": 25, "isthing": 0, "name": "house"},
+    {"color": [9, 7, 230], "id": 26, "isthing": 0, "name": "sea"},
+    {"color": [220, 220, 220], "id": 27, "isthing": 1, "name": "mirror"},
+    {"color": [255, 9, 92], "id": 28, "isthing": 0, "name": "rug"},
+    {"color": [112, 9, 255], "id": 29, "isthing": 0, "name": "field"},
+    {"color": [8, 255, 214], "id": 30, "isthing": 1, "name": "armchair"},
+    {"color": [7, 255, 224], "id": 31, "isthing": 1, "name": "seat"},
+    {"color": [255, 184, 6], "id": 32, "isthing": 1, "name": "fence"},
+    {"color": [10, 255, 71], "id": 33, "isthing": 1, "name": "desk"},
+    {"color": [255, 41, 10], "id": 34, "isthing": 0, "name": "rock, stone"},
+    {"color": [7, 255, 255], "id": 35, "isthing": 1, "name": "wardrobe, closet, press"},
+    {"color": [224, 255, 8], "id": 36, "isthing": 1, "name": "lamp"},
+    {"color": [102, 8, 255], "id": 37, "isthing": 1, "name": "tub"},
+    {"color": [255, 61, 6], "id": 38, "isthing": 1, "name": "rail"},
+    {"color": [255, 194, 7], "id": 39, "isthing": 1, "name": "cushion"},
+    {"color": [255, 122, 8], "id": 40, "isthing": 0, "name": "base, pedestal, stand"},
+    {"color": [0, 255, 20], "id": 41, "isthing": 1, "name": "box"},
+    {"color": [255, 8, 41], "id": 42, "isthing": 1, "name": "column, pillar"},
+    {"color": [255, 5, 153], "id": 43, "isthing": 1, "name": "signboard, sign"},
+    {
+        "color": [6, 51, 255],
+        "id": 44,
+        "isthing": 1,
+        "name": "chest of drawers, chest, bureau, dresser",
+    },
+    {"color": [235, 12, 255], "id": 45, "isthing": 1, "name": "counter"},
+    {"color": [160, 150, 20], "id": 46, "isthing": 0, "name": "sand"},
+    {"color": [0, 163, 255], "id": 47, "isthing": 1, "name": "sink"},
+    {"color": [140, 140, 140], "id": 48, "isthing": 0, "name": "skyscraper"},
+    {"color": [250, 10, 15], "id": 49, "isthing": 1, "name": "fireplace"},
+    {"color": [20, 255, 0], "id": 50, "isthing": 1, "name": "refrigerator, icebox"},
+    {"color": [31, 255, 0], "id": 51, "isthing": 0, "name": "grandstand, covered stand"},
+    {"color": [255, 31, 0], "id": 52, "isthing": 0, "name": "path"},
+    {"color": [255, 224, 0], "id": 53, "isthing": 1, "name": "stairs"},
+    {"color": [153, 255, 0], "id": 54, "isthing": 0, "name": "runway"},
+    {"color": [0, 0, 255], "id": 55, "isthing": 1, "name": "case, display case, showcase, vitrine"},
+    {
+        "color": [255, 71, 0],
+        "id": 56,
+        "isthing": 1,
+        "name": "pool table, billiard table, snooker table",
+    },
+    {"color": [0, 235, 255], "id": 57, "isthing": 1, "name": "pillow"},
+    {"color": [0, 173, 255], "id": 58, "isthing": 1, "name": "screen door, screen"},
+    {"color": [31, 0, 255], "id": 59, "isthing": 0, "name": "stairway, staircase"},
+    {"color": [11, 200, 200], "id": 60, "isthing": 0, "name": "river"},
+    {"color": [255, 82, 0], "id": 61, "isthing": 0, "name": "bridge, span"},
+    {"color": [0, 255, 245], "id": 62, "isthing": 1, "name": "bookcase"},
+    {"color": [0, 61, 255], "id": 63, "isthing": 0, "name": "blind, screen"},
+    {"color": [0, 255, 112], "id": 64, "isthing": 1, "name": "coffee table"},
+    {
+        "color": [0, 255, 133],
+        "id": 65,
+        "isthing": 1,
+        "name": "toilet, can, commode, crapper, pot, potty, stool, throne",
+    },
+    {"color": [255, 0, 0], "id": 66, "isthing": 1, "name": "flower"},
+    {"color": [255, 163, 0], "id": 67, "isthing": 1, "name": "book"},
+    {"color": [255, 102, 0], "id": 68, "isthing": 0, "name": "hill"},
+    {"color": [194, 255, 0], "id": 69, "isthing": 1, "name": "bench"},
+    {"color": [0, 143, 255], "id": 70, "isthing": 1, "name": "countertop"},
+    {"color": [51, 255, 0], "id": 71, "isthing": 1, "name": "stove"},
+    {"color": [0, 82, 255], "id": 72, "isthing": 1, "name": "palm, palm tree"},
+    {"color": [0, 255, 41], "id": 73, "isthing": 1, "name": "kitchen island"},
+    {"color": [0, 255, 173], "id": 74, "isthing": 1, "name": "computer"},
+    {"color": [10, 0, 255], "id": 75, "isthing": 1, "name": "swivel chair"},
+    {"color": [173, 255, 0], "id": 76, "isthing": 1, "name": "boat"},
+    {"color": [0, 255, 153], "id": 77, "isthing": 0, "name": "bar"},
+    {"color": [255, 92, 0], "id": 78, "isthing": 1, "name": "arcade machine"},
+    {"color": [255, 0, 255], "id": 79, "isthing": 0, "name": "hovel, hut, hutch, shack, shanty"},
+    {"color": [255, 0, 245], "id": 80, "isthing": 1, "name": "bus"},
+    {"color": [255, 0, 102], "id": 81, "isthing": 1, "name": "towel"},
+    {"color": [255, 173, 0], "id": 82, "isthing": 1, "name": "light"},
+    {"color": [255, 0, 20], "id": 83, "isthing": 1, "name": "truck"},
+    {"color": [255, 184, 184], "id": 84, "isthing": 0, "name": "tower"},
+    {"color": [0, 31, 255], "id": 85, "isthing": 1, "name": "chandelier"},
+    {"color": [0, 255, 61], "id": 86, "isthing": 1, "name": "awning, sunshade, sunblind"},
+    {"color": [0, 71, 255], "id": 87, "isthing": 1, "name": "street lamp"},
+    {"color": [255, 0, 204], "id": 88, "isthing": 1, "name": "booth"},
+    {"color": [0, 255, 194], "id": 89, "isthing": 1, "name": "tv"},
+    {"color": [0, 255, 82], "id": 90, "isthing": 1, "name": "plane"},
+    {"color": [0, 10, 255], "id": 91, "isthing": 0, "name": "dirt track"},
+    {"color": [0, 112, 255], "id": 92, "isthing": 1, "name": "clothes"},
+    {"color": [51, 0, 255], "id": 93, "isthing": 1, "name": "pole"},
+    {"color": [0, 194, 255], "id": 94, "isthing": 0, "name": "land, ground, soil"},
+    {
+        "color": [0, 122, 255],
+        "id": 95,
+        "isthing": 1,
+        "name": "bannister, banister, balustrade, balusters, handrail",
+    },
+    {
+        "color": [0, 255, 163],
+        "id": 96,
+        "isthing": 0,
+        "name": "escalator, moving staircase, moving stairway",
+    },
+    {
+        "color": [255, 153, 0],
+        "id": 97,
+        "isthing": 1,
+        "name": "ottoman, pouf, pouffe, puff, hassock",
+    },
+    {"color": [0, 255, 10], "id": 98, "isthing": 1, "name": "bottle"},
+    {"color": [255, 112, 0], "id": 99, "isthing": 0, "name": "buffet, counter, sideboard"},
+    {
+        "color": [143, 255, 0],
+        "id": 100,
+        "isthing": 0,
+        "name": "poster, posting, placard, notice, bill, card",
+    },
+    {"color": [82, 0, 255], "id": 101, "isthing": 0, "name": "stage"},
+    {"color": [163, 255, 0], "id": 102, "isthing": 1, "name": "van"},
+    {"color": [255, 235, 0], "id": 103, "isthing": 1, "name": "ship"},
+    {"color": [8, 184, 170], "id": 104, "isthing": 1, "name": "fountain"},
+    {
+        "color": [133, 0, 255],
+        "id": 105,
+        "isthing": 0,
+        "name": "conveyer belt, conveyor belt, conveyer, conveyor, transporter",
+    },
+    {"color": [0, 255, 92], "id": 106, "isthing": 0, "name": "canopy"},
+    {
+        "color": [184, 0, 255],
+        "id": 107,
+        "isthing": 1,
+        "name": "washer, automatic washer, washing machine",
+    },
+    {"color": [255, 0, 31], "id": 108, "isthing": 1, "name": "plaything, toy"},
+    {"color": [0, 184, 255], "id": 109, "isthing": 0, "name": "pool"},
+    {"color": [0, 214, 255], "id": 110, "isthing": 1, "name": "stool"},
+    {"color": [255, 0, 112], "id": 111, "isthing": 1, "name": "barrel, cask"},
+    {"color": [92, 255, 0], "id": 112, "isthing": 1, "name": "basket, handbasket"},
+    {"color": [0, 224, 255], "id": 113, "isthing": 0, "name": "falls"},
+    {"color": [112, 224, 255], "id": 114, "isthing": 0, "name": "tent"},
+    {"color": [70, 184, 160], "id": 115, "isthing": 1, "name": "bag"},
+    {"color": [163, 0, 255], "id": 116, "isthing": 1, "name": "minibike, motorbike"},
+    {"color": [153, 0, 255], "id": 117, "isthing": 0, "name": "cradle"},
+    {"color": [71, 255, 0], "id": 118, "isthing": 1, "name": "oven"},
+    {"color": [255, 0, 163], "id": 119, "isthing": 1, "name": "ball"},
+    {"color": [255, 204, 0], "id": 120, "isthing": 1, "name": "food, solid food"},
+    {"color": [255, 0, 143], "id": 121, "isthing": 1, "name": "step, stair"},
+    {"color": [0, 255, 235], "id": 122, "isthing": 0, "name": "tank, storage tank"},
+    {"color": [133, 255, 0], "id": 123, "isthing": 1, "name": "trade name"},
+    {"color": [255, 0, 235], "id": 124, "isthing": 1, "name": "microwave"},
+    {"color": [245, 0, 255], "id": 125, "isthing": 1, "name": "pot"},
+    {"color": [255, 0, 122], "id": 126, "isthing": 1, "name": "animal"},
+    {"color": [255, 245, 0], "id": 127, "isthing": 1, "name": "bicycle"},
+    {"color": [10, 190, 212], "id": 128, "isthing": 0, "name": "lake"},
+    {"color": [214, 255, 0], "id": 129, "isthing": 1, "name": "dishwasher"},
+    {"color": [0, 204, 255], "id": 130, "isthing": 1, "name": "screen"},
+    {"color": [20, 0, 255], "id": 131, "isthing": 0, "name": "blanket, cover"},
+    {"color": [255, 255, 0], "id": 132, "isthing": 1, "name": "sculpture"},
+    {"color": [0, 153, 255], "id": 133, "isthing": 1, "name": "hood, exhaust hood"},
+    {"color": [0, 41, 255], "id": 134, "isthing": 1, "name": "sconce"},
+    {"color": [0, 255, 204], "id": 135, "isthing": 1, "name": "vase"},
+    {"color": [41, 0, 255], "id": 136, "isthing": 1, "name": "traffic light"},
+    {"color": [41, 255, 0], "id": 137, "isthing": 1, "name": "tray"},
+    {"color": [173, 0, 255], "id": 138, "isthing": 1, "name": "trash can"},
+    {"color": [0, 245, 255], "id": 139, "isthing": 1, "name": "fan"},
+    {"color": [71, 0, 255], "id": 140, "isthing": 0, "name": "pier"},
+    {"color": [122, 0, 255], "id": 141, "isthing": 0, "name": "crt screen"},
+    {"color": [0, 255, 184], "id": 142, "isthing": 1, "name": "plate"},
+    {"color": [0, 92, 255], "id": 143, "isthing": 1, "name": "monitor"},
+    {"color": [184, 255, 0], "id": 144, "isthing": 1, "name": "bulletin board"},
+    {"color": [0, 133, 255], "id": 145, "isthing": 0, "name": "shower"},
+    {"color": [255, 214, 0], "id": 146, "isthing": 1, "name": "radiator"},
+    {"color": [25, 194, 194], "id": 147, "isthing": 1, "name": "glass, drinking glass"},
+    {"color": [102, 255, 0], "id": 148, "isthing": 1, "name": "clock"},
+    {"color": [92, 0, 255], "id": 149, "isthing": 1, "name": "flag"},
+]
+ADE20k_COLORS = [k["color"] for k in ADE20K_150_CATEGORIES]
+MetadataCatalog.get("ade20k_sem_seg_train").set(
+    stuff_colors=ADE20k_COLORS[:],
+)
+MetadataCatalog.get("ade20k_sem_seg_val").set(
+    stuff_colors=ADE20k_COLORS[:],
+)
+def load_ade20k_panoptic_json(json_file, image_dir, gt_dir, semseg_dir, meta):
+    """
+    Args:
+        image_dir (str): path to the raw dataset. e.g., "~/coco/train2017".
+        gt_dir (str): path to the raw annotations. e.g., "~/coco/panoptic_train2017".
+        json_file (str): path to the json file. e.g., "~/coco/annotations/panoptic_train2017.json".
+    Returns:
+        list[dict]: a list of dicts in Detectron2 standard format. (See
+        `Using Custom Datasets </tutorials/datasets.html>`_ )
+    """
+    def _convert_category_id(segment_info, meta):
+        if segment_info["category_id"] in meta["thing_dataset_id_to_contiguous_id"]:
+            segment_info["category_id"] = meta["thing_dataset_id_to_contiguous_id"][
+                segment_info["category_id"]
+            ]
+            segment_info["isthing"] = True
+        else:
+            segment_info["category_id"] = meta["stuff_dataset_id_to_contiguous_id"][
+                segment_info["category_id"]
+            ]
+            segment_info["isthing"] = False
+        return segment_info
+    with PathManager.open(json_file) as f:
+        json_info = json.load(f)
+    ret = []
+    for ann in json_info["annotations"]:
+        image_id = ann["image_id"]
+        # TODO: currently we assume image and label has the same filename but
+        # different extension, and images have extension ".jpg" for COCO. Need
+        # to make image extension a user-provided argument if we extend this
+        # function to support other COCO-like datasets.
+        image_file = os.path.join(image_dir, os.path.splitext(ann["file_name"])[0] + ".jpg")
+        label_file = os.path.join(gt_dir, ann["file_name"])
+        sem_label_file = os.path.join(semseg_dir, ann["file_name"])
+        segments_info = [_convert_category_id(x, meta) for x in ann["segments_info"]]
+        ret.append(
+            {
+                "file_name": image_file,
+                "image_id": image_id,
+                "pan_seg_file_name": label_file,
+                "sem_seg_file_name": sem_label_file,
+                "segments_info": segments_info,
+            }
+        )
+    assert len(ret), f"No images found in {image_dir}!"
+    assert PathManager.isfile(ret[0]["file_name"]), ret[0]["file_name"]
+    assert PathManager.isfile(ret[0]["pan_seg_file_name"]), ret[0]["pan_seg_file_name"]
+    assert PathManager.isfile(ret[0]["sem_seg_file_name"]), ret[0]["sem_seg_file_name"]
+    return ret
+def register_ade20k_panoptic(
+    name, metadata, image_root, panoptic_root, semantic_root, panoptic_json, instances_json=None,
+):
+    """
+    Register a "standard" version of ADE20k panoptic segmentation dataset named `name`.
+    The dictionaries in this registered dataset follows detectron2's standard format.
+    Hence it's called "standard".
+    Args:
+        name (str): the name that identifies a dataset,
+            e.g. "ade20k_panoptic_train"
+        metadata (dict): extra metadata associated with this dataset.
+        image_root (str): directory which contains all the images
+        panoptic_root (str): directory which contains panoptic annotation images in COCO format
+        panoptic_json (str): path to the json panoptic annotation file in COCO format
+        sem_seg_root (none): not used, to be consistent with
+            `register_coco_panoptic_separated`.
+        instances_json (str): path to the json instance annotation file
+    """
+    panoptic_name = name
+    DatasetCatalog.register(
+        panoptic_name,
+        lambda: load_ade20k_panoptic_json(
+            panoptic_json, image_root, panoptic_root, semantic_root, metadata
+        ),
+    )
+    MetadataCatalog.get(panoptic_name).set(
+        panoptic_root=panoptic_root,
+        image_root=image_root,
+        panoptic_json=panoptic_json,
+        json_file=instances_json,
+        evaluator_type="ade20k_panoptic_seg",
+        ignore_label=255,
+        label_divisor=1000,
+        **metadata,
+    )
+_PREDEFINED_SPLITS_ADE20K_PANOPTIC = {
+    "ade20k_panoptic_train": (
+        "ADEChallengeData2016/images/training",
+        "ADEChallengeData2016/ade20k_panoptic_train",
+        "ADEChallengeData2016/ade20k_panoptic_train.json",
+        "ADEChallengeData2016/annotations_detectron2/training",
+        "ADEChallengeData2016/ade20k_instance_train.json",
+    ),
+    "ade20k_panoptic_val": (
+        "ADEChallengeData2016/images/validation",
+        "ADEChallengeData2016/ade20k_panoptic_val",
+        "ADEChallengeData2016/ade20k_panoptic_val.json",
+        "ADEChallengeData2016/annotations_detectron2/validation",
+        "ADEChallengeData2016/ade20k_instance_val.json",
+    ),
+}
+def get_metadata():
+    meta = {}
+    # The following metadata maps contiguous id from [0, #thing categories +
+    # #stuff categories) to their names and colors. We have to replica of the
+    # same name and color under "thing_*" and "stuff_*" because the current
+    # visualization function in D2 handles thing and class classes differently
+    # due to some heuristic used in Panoptic FPN. We keep the same naming to
+    # enable reusing existing visualization functions.
+    thing_classes = [k["name"] for k in ADE20K_150_CATEGORIES if k["isthing"] == 1]
+    thing_colors = [k["color"] for k in ADE20K_150_CATEGORIES if k["isthing"] == 1]
+    stuff_classes = [k["name"] for k in ADE20K_150_CATEGORIES]
+    stuff_colors = [k["color"] for k in ADE20K_150_CATEGORIES]
+    meta["thing_classes"] = thing_classes
+    meta["thing_colors"] = thing_colors
+    meta["stuff_classes"] = stuff_classes
+    meta["stuff_colors"] = stuff_colors
+    # Convert category id for training:
+    #   category id: like semantic segmentation, it is the class id for each
+    #   pixel. Since there are some classes not used in evaluation, the category
+    #   id is not always contiguous and thus we have two set of category ids:
+    #       - original category id: category id in the original dataset, mainly
+    #           used for evaluation.
+    #       - contiguous category id: [0, #classes), in order to train the linear
+    #           softmax classifier.
+    thing_dataset_id_to_contiguous_id = {}
+    stuff_dataset_id_to_contiguous_id = {}
+    for i, cat in enumerate(ADE20K_150_CATEGORIES):
+        if cat["isthing"]:
+            thing_dataset_id_to_contiguous_id[cat["id"]] = i
+        # else:
+        #     stuff_dataset_id_to_contiguous_id[cat["id"]] = i
+        # in order to use sem_seg evaluator
+        stuff_dataset_id_to_contiguous_id[cat["id"]] = i
+    meta["thing_dataset_id_to_contiguous_id"] = thing_dataset_id_to_contiguous_id
+    meta["stuff_dataset_id_to_contiguous_id"] = stuff_dataset_id_to_contiguous_id
+    return meta
+def register_all_ade20k_panoptic(root):
+    metadata = get_metadata()
+    for (
+        prefix,
+        (image_root, panoptic_root, panoptic_json, semantic_root, instance_json),
+    ) in _PREDEFINED_SPLITS_ADE20K_PANOPTIC.items():
+        # The "standard" version of COCO panoptic segmentation dataset,
+        # e.g. used by Panoptic-DeepLab
+        register_ade20k_panoptic(
+            prefix,
+            metadata,
+            os.path.join(root, image_root),
+            os.path.join(root, panoptic_root),
+            os.path.join(root, semantic_root),
+            os.path.join(root, panoptic_json),
+            os.path.join(root, instance_json),
+        )
+_root = os.getenv("DETECTRON2_DATASETS", "datasets")
+register_all_ade20k_panoptic(_root)

oneformer/data/datasets/register_cityscapes_panoptic.py ADDED Viewed

	@@ -0,0 +1,199 @@

+# ------------------------------------------------------------------------------
+# Reference: https://github.com/facebookresearch/detectron2/blob/main/detectron2/data/datasets/cityscapes_panoptic.py
+# Modified by Jitesh Jain (https://github.com/praeclarumjj3)
+# ------------------------------------------------------------------------------
+import json
+import logging
+import os
+from detectron2.data import DatasetCatalog, MetadataCatalog
+from detectron2.data.datasets.builtin_meta import CITYSCAPES_CATEGORIES
+from detectron2.utils.file_io import PathManager
+"""
+This file contains functions to register the Cityscapes panoptic dataset to the DatasetCatalog.
+"""
+logger = logging.getLogger(__name__)
+def get_cityscapes_panoptic_files(image_dir, gt_dir, json_info):
+    files = []
+    # scan through the directory
+    cities = PathManager.ls(image_dir)
+    logger.info(f"{len(cities)} cities found in '{image_dir}'.")
+    image_dict = {}
+    for city in cities:
+        city_img_dir = os.path.join(image_dir, city)
+        for basename in PathManager.ls(city_img_dir):
+            image_file = os.path.join(city_img_dir, basename)
+            suffix = "_leftImg8bit.png"
+            assert basename.endswith(suffix), basename
+            basename = os.path.basename(basename)[: -len(suffix)]
+            image_dict[basename] = image_file
+    for ann in json_info["annotations"]:
+        image_file = image_dict.get(ann["image_id"], None)
+        assert image_file is not None, "No image {} found for annotation {}".format(
+            ann["image_id"], ann["file_name"]
+        )
+        label_file = os.path.join(gt_dir, ann["file_name"])
+        segments_info = ann["segments_info"]
+        files.append((image_file, label_file, segments_info))
+    assert len(files), "No images found in {}".format(image_dir)
+    assert PathManager.isfile(files[0][0]), files[0][0]
+    assert PathManager.isfile(files[0][1]), files[0][1]
+    return files
+def load_cityscapes_panoptic(image_dir, gt_dir, gt_json, meta):
+    """
+    Args:
+        image_dir (str): path to the raw dataset. e.g., "~/cityscapes/leftImg8bit/train".
+        gt_dir (str): path to the raw annotations. e.g.,
+            "~/cityscapes/gtFine/cityscapes_panoptic_train".
+        gt_json (str): path to the json file. e.g.,
+            "~/cityscapes/gtFine/cityscapes_panoptic_train.json".
+        meta (dict): dictionary containing "thing_dataset_id_to_contiguous_id"
+            and "stuff_dataset_id_to_contiguous_id" to map category ids to
+            contiguous ids for training.
+    Returns:
+        list[dict]: a list of dicts in Detectron2 standard format. (See
+        `Using Custom Datasets </tutorials/datasets.html>`_ )
+    """
+    def _convert_category_id(segment_info, meta):
+        if segment_info["category_id"] in meta["thing_dataset_id_to_contiguous_id"]:
+            segment_info["category_id"] = meta["thing_dataset_id_to_contiguous_id"][
+                segment_info["category_id"]
+            ]
+        else:
+            segment_info["category_id"] = meta["stuff_dataset_id_to_contiguous_id"][
+                segment_info["category_id"]
+            ]
+        return segment_info
+    assert os.path.exists(
+        gt_json
+    ), "Please run `python cityscapesscripts/preparation/createPanopticImgs.py` to generate label files."  # noqa
+    with open(gt_json) as f:
+        json_info = json.load(f)
+    files = get_cityscapes_panoptic_files(image_dir, gt_dir, json_info)
+    ret = []
+    for image_file, label_file, segments_info in files:
+        sem_label_file = (
+            image_file.replace("leftImg8bit", "gtFine").split(".")[0] + "_labelTrainIds.png"
+        )
+        segments_info = [_convert_category_id(x, meta) for x in segments_info]
+        ret.append(
+            {
+                "file_name": image_file,
+                "image_id": "_".join(
+                    os.path.splitext(os.path.basename(image_file))[0].split("_")[:3]
+                ),
+                "sem_seg_file_name": sem_label_file,
+                "pan_seg_file_name": label_file,
+                "segments_info": segments_info,
+            }
+        )
+    assert len(ret), f"No images found in {image_dir}!"
+    assert PathManager.isfile(
+        ret[0]["sem_seg_file_name"]
+    ), "Please generate labelTrainIds.png with cityscapesscripts/preparation/createTrainIdLabelImgs.py"  # noqa
+    assert PathManager.isfile(
+        ret[0]["pan_seg_file_name"]
+    ), "Please generate panoptic annotation with python cityscapesscripts/preparation/createPanopticImgs.py"  # noqa
+    return ret
+_RAW_CITYSCAPES_PANOPTIC_SPLITS = {
+    "cityscapes_fine_panoptic_train": (
+        "cityscapes/leftImg8bit/train",
+        "cityscapes/gtFine/cityscapes_panoptic_train",
+        "cityscapes/gtFine/cityscapes_panoptic_train.json",
+    ),
+    "cityscapes_fine_panoptic_val": (
+        "cityscapes/leftImg8bit/val",
+        "cityscapes/gtFine/cityscapes_panoptic_val",
+        "cityscapes/gtFine/cityscapes_panoptic_val.json",
+    ),
+    # "cityscapes_fine_panoptic_test": not supported yet
+}
+def register_all_cityscapes_panoptic(root):
+    meta = {}
+    # The following metadata maps contiguous id from [0, #thing categories +
+    # #stuff categories) to their names and colors. We have to replicate the
+    # same name and color under "thing_*" and "stuff_*" because the current
+    # visualization function in D2 handles thing and stuff classes differently
+    # due to some heuristic used in Panoptic FPN. We keep the same naming to
+    # enable reusing existing visualization functions.
+    thing_classes = [k["name"] for k in CITYSCAPES_CATEGORIES]
+    thing_colors = [k["color"] for k in CITYSCAPES_CATEGORIES]
+    stuff_classes = [k["name"] for k in CITYSCAPES_CATEGORIES]
+    stuff_colors = [k["color"] for k in CITYSCAPES_CATEGORIES]
+    meta["thing_classes"] = thing_classes
+    meta["thing_colors"] = thing_colors
+    meta["stuff_classes"] = stuff_classes
+    meta["stuff_colors"] = stuff_colors
+    # There are three types of ids in cityscapes panoptic segmentation:
+    # (1) category id: like semantic segmentation, it is the class id for each
+    #   pixel. Since there are some classes not used in evaluation, the category
+    #   id is not always contiguous and thus we have two set of category ids:
+    #       - original category id: category id in the original dataset, mainly
+    #           used for evaluation.
+    #       - contiguous category id: [0, #classes), in order to train the classifier
+    # (2) instance id: this id is used to differentiate different instances from
+    #   the same category. For "stuff" classes, the instance id is always 0; for
+    #   "thing" classes, the instance id starts from 1 and 0 is reserved for
+    #   ignored instances (e.g. crowd annotation).
+    # (3) panoptic id: this is the compact id that encode both category and
+    #   instance id by: category_id * 1000 + instance_id.
+    thing_dataset_id_to_contiguous_id = {}
+    stuff_dataset_id_to_contiguous_id = {}
+    for k in CITYSCAPES_CATEGORIES:
+        if k["isthing"] == 1:
+            thing_dataset_id_to_contiguous_id[k["id"]] = k["trainId"]
+        else:
+            stuff_dataset_id_to_contiguous_id[k["id"]] = k["trainId"]
+    meta["thing_dataset_id_to_contiguous_id"] = thing_dataset_id_to_contiguous_id
+    meta["stuff_dataset_id_to_contiguous_id"] = stuff_dataset_id_to_contiguous_id
+    for key, (image_dir, gt_dir, gt_json) in _RAW_CITYSCAPES_PANOPTIC_SPLITS.items():
+        image_dir = os.path.join(root, image_dir)
+        gt_dir = os.path.join(root, gt_dir)
+        gt_json = os.path.join(root, gt_json)
+        if key in DatasetCatalog.list():
+            DatasetCatalog.remove(key)
+        DatasetCatalog.register(
+            key, lambda x=image_dir, y=gt_dir, z=gt_json: load_cityscapes_panoptic(x, y, z, meta)
+        )
+        MetadataCatalog.get(key).set(
+            panoptic_root=gt_dir,
+            image_root=image_dir,
+            panoptic_json=gt_json,
+            gt_dir=gt_dir.replace("cityscapes_panoptic_", ""),
+            evaluator_type="cityscapes_panoptic_seg",
+            ignore_label=255,
+            label_divisor=1000,
+            **meta,
+        )
+_root = os.getenv("DETECTRON2_DATASETS", "datasets")
+register_all_cityscapes_panoptic(_root)

oneformer/data/datasets/register_coco_panoptic2instance.py ADDED Viewed

	@@ -0,0 +1,44 @@

+# ------------------------------------------------------------------------------
+# Reference: https://github.com/facebookresearch/detectron2/blob/main/detectron2/data/datasets/builtin.py
+# Modified by Jitesh Jain (https://github.com/praeclarumjj3)
+# ------------------------------------------------------------------------------
+"""
+This file registers pre-defined datasets at hard-coded paths, and their metadata.
+We hard-code metadata for common datasets. This will enable:
+1. Consistency check when loading the datasets
+2. Use models on these standard datasets directly and run demos,
+   without having to download the dataset annotations
+We hard-code some paths to the dataset that's assumed to
+exist in "./datasets/".
+Users SHOULD NOT use this file to create new dataset / metadata for new dataset.
+To add new dataset, refer to the tutorial "docs/DATASETS.md".
+"""
+import os
+from detectron2.data.datasets.builtin_meta import  _get_builtin_metadata
+from detectron2.data.datasets.coco import register_coco_instances
# Dataset name -> (image directory, instances json), both relative to the dataset root.
_PREDEFINED_SPLITS_COCO = {
    "coco_2017_val_panoptic2instance": (
        "coco/val2017",
        "coco/annotations/panoptic2instances_val2017.json",
    ),
}


def register_panoptic2instances_coco(root):
    """Register each entry of ``_PREDEFINED_SPLITS_COCO`` as a COCO instance dataset.

    Args:
        root (str): directory the relative image and json paths are joined onto.
            A json path containing "://" is passed through unchanged.
    """
    for dataset_name, (image_root, json_file) in _PREDEFINED_SPLITS_COCO.items():
        if "://" in json_file:
            json_path = json_file
        else:
            # Assume pre-defined datasets live under `root` (default `./datasets`).
            json_path = os.path.join(root, json_file)
        image_path = os.path.join(root, image_root)
        register_coco_instances(
            dataset_name,
            _get_builtin_metadata("coco"),
            json_path,
            image_path,
        )


_root = os.path.expanduser(os.getenv("DETECTRON2_DATASETS", "datasets"))
register_panoptic2instances_coco(_root)

oneformer/data/datasets/register_coco_panoptic_annos_semseg.py ADDED Viewed

	@@ -0,0 +1,367 @@

+# ------------------------------------------------------------------------------
+# Reference: https://github.com/facebookresearch/Mask2Former/blob/main/mask2former/data/datasets/register_coco_panoptic_annos_semseg.py
+# Modified by Jitesh Jain (https://github.com/praeclarumjj3)
+# ------------------------------------------------------------------------------
+import json
+import os
+from detectron2.data import DatasetCatalog, MetadataCatalog
+from detectron2.data.datasets import load_sem_seg
+from detectron2.data.datasets.builtin_meta import COCO_CATEGORIES
+from detectron2.utils.file_io import PathManager
+import contextlib
+import logging
+import io
+from fvcore.common.timer import Timer
+import pycocotools.mask as mask_util
+from detectron2.structures import BoxMode
+logger = logging.getLogger(__name__)
# Maps each panoptic dataset name to three root-relative paths:
#   (panoptic annotation directory, panoptic annotation json,
#    semantic annotation directory).
# The first entry is the original panoptic annotation directory. The last one
# contains semantic annotations converted from the panoptic annotations; it is
# used by PanopticFPN, and the script at
# detectron2/datasets/prepare_panoptic_fpn.py can create these directories.
_PREDEFINED_SPLITS_COCO_PANOPTIC = {
    f"coco_2017_{split}_panoptic": (
        f"coco/panoptic_{split}2017",
        f"coco/annotations/panoptic_{split}2017.json",
        f"coco/panoptic_semseg_{split}2017",
    )
    for split in ("train", "val")
}
def load_coco_instance_json(json_file, image_root, dataset_name=None):
    """Load a COCO-format instance annotation file into per-image records.

    Args:
        json_file (str): path of the COCO instances json; it is resolved through
            ``PathManager.get_local_path`` before being parsed.
        image_root (str): directory joined onto every image's "file_name".
        dataset_name (str or None): when given, ``thing_classes`` and
            ``thing_dataset_id_to_contiguous_id`` of this dataset's metadata are
            set from the json's categories, and every annotation's
            "category_id" is remapped to its contiguous id.

    Returns:
        dict: maps each image id to a record with the keys "file_name",
        "height", "width", "image_id" and "annotations" (a list of per-object
        dicts).

    Raises:
        ValueError: if an annotation carries an empty "bbox".
        KeyError: if an annotation's category id is not among the json's
            categories (only when ``dataset_name`` is given).
    """
    from pycocotools.coco import COCO

    timer = Timer()
    json_file = PathManager.get_local_path(json_file)
    # Whatever the COCO constructor prints to stdout is captured and discarded.
    with contextlib.redirect_stdout(io.StringIO()):
        coco_api = COCO(json_file)
    if timer.seconds() > 1:
        logger.info("Loading {} takes {:.2f} seconds.".format(json_file, timer.seconds()))

    id_map = None
    if dataset_name is not None:
        meta = MetadataCatalog.get(dataset_name)
        cat_ids = sorted(coco_api.getCatIds())
        categories = coco_api.loadCats(cat_ids)
        # Categories in a custom json file may be unsorted, so order names by id.
        meta.thing_classes = [c["name"] for c in sorted(categories, key=lambda c: c["id"])]

        # In COCO certain category ids are artificially removed and, by
        # convention, always ignored. The ids listed under "categories" are
        # translated to contiguous ids in [0, #categories). User json files with
        # non-contiguous ids get the same mapping, plus a warning.
        ids_are_contiguous = min(cat_ids) == 1 and max(cat_ids) == len(cat_ids)
        if not ids_are_contiguous and "coco" not in dataset_name:
            logger.warning(
                """
Category ids in annotations are not in [1, #categories]! We'll apply a mapping for you.
"""
            )
        id_map = {v: i for i, v in enumerate(cat_ids)}
        meta.thing_dataset_id_to_contiguous_id = id_map

    # Sorted image ids give reproducible results.
    img_ids = sorted(coco_api.imgs.keys())
    # `imgs` is a list of dicts, each looking something like:
    # {'license': 4,
    #  'url': 'http://farm6.staticflickr.com/5454/9413846304_881d5e5c3b_z.jpg',
    #  'file_name': 'COCO_val2014_000000001268.jpg',
    #  'height': 427,
    #  'width': 640,
    #  'date_captured': '2013-11-17 05:57:24',
    #  'id': 1268}
    imgs = coco_api.loadImgs(img_ids)
    # `anns` is a list[list[dict]]: the outer list runs over images, the inner
    # one over the objects of that image. Example of anns[0]:
    # [{'segmentation': [[192.81, 247.09, ..., 219.03, 249.06]],
    #   'area': 1035.749,
    #   'iscrowd': 0,
    #   'image_id': 1268,
    #   'bbox': [192.81, 224.8, 74.73, 33.43],
    #   'category_id': 16,
    #   'id': 42986},
    #  ...]
    anns = [coco_api.imgToAnns[img_id] for img_id in img_ids]

    total_num_valid_anns = sum(len(per_image) for per_image in anns)
    total_num_anns = len(coco_api.anns)
    if total_num_valid_anns < total_num_anns:
        logger.warning(
            f"{json_file} contains {total_num_anns} annotations, but only "
            f"{total_num_valid_anns} of them match to images in the file."
        )

    if "minival" not in json_file:
        # The popular valminusminival & minival annotations for COCO2014 contain
        # duplicated annotation ids. The ratio of buggy annotations there is tiny
        # and does not affect accuracy, so those files are explicitly exempted.
        ann_ids = [ann["id"] for per_image in anns for ann in per_image]
        assert len(set(ann_ids)) == len(ann_ids), "Annotation ids in '{}' are not unique!".format(
            json_file
        )

    imgs_anns = list(zip(imgs, anns))
    logger.info("Loaded {} images in COCO format from {}".format(len(imgs_anns), json_file))

    dataset_dicts = {}
    copied_keys = ["iscrowd", "bbox", "keypoints", "category_id"]
    num_instances_without_valid_segmentation = 0

    for img_dict, img_annos in imgs_anns:
        record = {
            "file_name": os.path.join(image_root, img_dict["file_name"]),
            "height": img_dict["height"],
            "width": img_dict["width"],
            "image_id": img_dict["id"],
        }
        image_id = record["image_id"]

        objs = []
        for anno in img_annos:
            # The annotation must belong to the image being processed. This only
            # fails when the parsing logic or the annotation file is buggy; the
            # original COCO valminusminival2014 & minival2014 files do contain
            # bugs that, with certain ways of using the COCO API, trigger it.
            assert anno["image_id"] == image_id
            assert anno.get("ignore", 0) == 0, '"ignore" in COCO json file is not supported.'

            obj = {key: anno[key] for key in copied_keys if key in anno}
            if "bbox" in obj and len(obj["bbox"]) == 0:
                raise ValueError(
                    f"One annotation of image {image_id} contains empty 'bbox' value! "
                    "This json does not have valid COCO format."
                )

            segm = anno.get("segmentation", None)
            if segm:  # either list[list[float]] or dict(RLE)
                if not isinstance(segm, dict):
                    # Drop invalid polygons (odd coordinate count or < 3 points).
                    segm = [poly for poly in segm if len(poly) % 2 == 0 and len(poly) >= 6]
                    if len(segm) == 0:
                        num_instances_without_valid_segmentation += 1
                        continue  # ignore this instance
                elif isinstance(segm["counts"], list):
                    # Uncompressed RLE: convert to compressed RLE.
                    segm = mask_util.frPyObjects(segm, *segm["size"])
                obj["segmentation"] = segm

            keypts = anno.get("keypoints", None)
            if keypts:  # list[int]
                for position, value in enumerate(keypts):
                    if position % 3 != 2:
                        # COCO's segmentation coordinates are floats in [0, H or W],
                        # but keypoint coordinates are integers in [0, H-1 or W-1].
                        # They are therefore treated as "pixel indices" and shifted
                        # by 0.5 to become floating point coordinates. Every third
                        # entry is left untouched. The list is updated in place.
                        keypts[position] = value + 0.5
                obj["keypoints"] = keypts

            obj["bbox_mode"] = BoxMode.XYWH_ABS
            if id_map:
                annotation_category_id = obj["category_id"]
                try:
                    obj["category_id"] = id_map[annotation_category_id]
                except KeyError as e:
                    raise KeyError(
                        f"Encountered category_id={annotation_category_id} "
                        "but this id does not exist in 'categories' of the json file."
                    ) from e
            objs.append(obj)

        record["annotations"] = objs
        dataset_dicts[image_id] = record

    if num_instances_without_valid_segmentation > 0:
        logger.warning(
            "Filtered out {} instances without valid segmentation. ".format(
                num_instances_without_valid_segmentation
            )
            + "There might be issues in your dataset generation process.  Please "
            "check https://detectron2.readthedocs.io/en/latest/tutorials/datasets.html carefully"
        )
    return dataset_dicts
def get_metadata():
    """Build the class-name, color and id-mapping metadata from ``COCO_CATEGORIES``.

    Returns:
        dict: with the keys "thing_classes", "thing_colors", "stuff_classes",
        "stuff_colors", "thing_dataset_id_to_contiguous_id" and
        "stuff_dataset_id_to_contiguous_id".
    """
    # The metadata maps contiguous ids in [0, #thing categories + #stuff categories)
    # to names and colors. The "stuff_*" lists cover *all* categories while the
    # "thing_*" lists keep only the thing categories; the thing/stuff naming is
    # kept so that the existing D2 visualization functions can be reused.
    thing_categories = [cat for cat in COCO_CATEGORIES if cat["isthing"] == 1]
    meta = {
        "thing_classes": [cat["name"] for cat in thing_categories],
        "thing_colors": [cat["color"] for cat in thing_categories],
        "stuff_classes": [cat["name"] for cat in COCO_CATEGORIES],
        "stuff_colors": [cat["color"] for cat in COCO_CATEGORIES],
    }

    # Convert category ids for training. Some classes are not used in
    # evaluation, so the original dataset ids are not contiguous:
    #   - original category id: the id in the original dataset, mainly used for
    #       evaluation.
    #   - contiguous category id: [0, #classes), used to train the linear
    #       softmax classifier.
    # A category's contiguous id is its position in COCO_CATEGORIES.
    meta["thing_dataset_id_to_contiguous_id"] = {
        cat["id"]: index for index, cat in enumerate(COCO_CATEGORIES) if cat["isthing"]
    }
    # Every category (things included) goes into the stuff mapping so that the
    # sem_seg evaluator can be used.
    meta["stuff_dataset_id_to_contiguous_id"] = {
        cat["id"]: index for index, cat in enumerate(COCO_CATEGORIES)
    }
    return meta
def load_coco_panoptic_json(json_file, instances_json, instances_name, image_dir, gt_dir, semseg_dir, meta):
    """Load COCO panoptic annotations together with the matching instance annotations.

    Args:
        json_file (str): path to the panoptic json file.
            e.g., "~/coco/annotations/panoptic_train2017.json".
        instances_json (str): path to the instances json, forwarded to
            ``load_coco_instance_json``.
        instances_name (str): dataset name forwarded to
            ``load_coco_instance_json``.
        image_dir (str): path to the raw dataset. e.g., "~/coco/train2017".
        gt_dir (str): path to the raw annotations. e.g., "~/coco/panoptic_train2017".
        semseg_dir (str): directory holding the semantic label files, which use
            the same file names as the panoptic ones.
        meta (dict): must provide "thing_dataset_id_to_contiguous_id" and
            "stuff_dataset_id_to_contiguous_id".

    Returns:
        list[dict]: a list of dicts in Detectron2 standard format. (See
        `Using Custom Datasets </tutorials/datasets.html>`_ )
    """

    def _to_contiguous(segment):
        # Rewrites the segment in place: the dataset category id becomes the
        # contiguous one, and "isthing" records which mapping supplied it.
        dataset_id = segment["category_id"]
        is_thing = dataset_id in meta["thing_dataset_id_to_contiguous_id"]
        mapping_key = (
            "thing_dataset_id_to_contiguous_id" if is_thing else "stuff_dataset_id_to_contiguous_id"
        )
        segment["category_id"] = meta[mapping_key][dataset_id]
        segment["isthing"] = is_thing
        return segment

    with PathManager.open(json_file) as f:
        json_info = json.load(f)

    instance_data_dicts = load_coco_instance_json(
        instances_json, image_dir.replace("panoptic_", ""), instances_name
    )

    ret = []
    for ann in json_info["annotations"]:
        image_id = int(ann["image_id"])
        label_name = ann["file_name"]
        # TODO: currently we assume image and label has the same filename but
        # different extension, and images have extension ".jpg" for COCO. Need
        # to make image extension a user-provided argument if we extend this
        # function to support other COCO-like datasets.
        image_file = os.path.join(image_dir, os.path.splitext(label_name)[0] + ".jpg")
        label_file = os.path.join(gt_dir, label_name)
        sem_label_file = os.path.join(semseg_dir, label_name)
        segments_info = [_to_contiguous(segment) for segment in ann["segments_info"]]
        record = {
            "file_name": image_file,
            "image_id": image_id,
            "pan_seg_file_name": label_file,
            "sem_seg_file_name": sem_label_file,
            "segments_info": segments_info,
            "annotations": instance_data_dicts[image_id]["annotations"],
        }
        ret.append(record)

    assert len(ret), f"No images found in {image_dir}!"
    # Only the first record is checked on disk.
    first = ret[0]
    assert PathManager.isfile(first["file_name"]), first["file_name"]
    assert PathManager.isfile(first["pan_seg_file_name"]), first["pan_seg_file_name"]
    assert PathManager.isfile(first["sem_seg_file_name"]), first["sem_seg_file_name"]
    return ret
def register_coco_panoptic_annos_sem_seg(
    name, metadata, image_root, panoptic_root, panoptic_json, sem_seg_root, instances_json, instances_name,
):
    """Register the ``<name>_with_sem_seg`` dataset and refresh the metadata of ``name``.

    Args:
        name (str): name of the existing panoptic dataset; its ``thing_classes``
            and ``thing_colors`` metadata are replaced by those in ``metadata``.
        metadata (dict): metadata stored for the new dataset and handed to the
            loader.
        image_root (str): image directory.
        panoptic_root (str): directory of the panoptic annotation files.
        panoptic_json (str): path of the panoptic annotation json.
        sem_seg_root (str): directory of the semantic annotation files.
        instances_json (str): path of the instances json.
        instances_name (str): dataset name passed on to the instance loader.
    """
    panoptic_meta = MetadataCatalog.get(name)
    # The attributes are deleted before being set again so that new values can
    # replace the ones already stored for this dataset.
    for attribute in ("thing_classes", "thing_colors"):
        delattr(panoptic_meta, attribute)
    panoptic_meta.set(
        thing_classes=metadata["thing_classes"],
        thing_colors=metadata["thing_colors"],
        # thing_dataset_id_to_contiguous_id=metadata["thing_dataset_id_to_contiguous_id"],
    )

    # the name is "coco_2017_train_panoptic_with_sem_seg" and "coco_2017_val_panoptic_with_sem_seg"
    semantic_name = name + "_with_sem_seg"

    def _load():
        return load_coco_panoptic_json(
            panoptic_json, instances_json, instances_name, image_root, panoptic_root, sem_seg_root, metadata
        )

    DatasetCatalog.register(semantic_name, _load)
    MetadataCatalog.get(semantic_name).set(
        sem_seg_root=sem_seg_root,
        panoptic_root=panoptic_root,
        image_root=image_root,
        panoptic_json=panoptic_json,
        json_file=instances_json,
        evaluator_type="coco_panoptic_seg",
        ignore_label=255,
        label_divisor=1000,
        **metadata,
    )
def register_all_coco_panoptic_annos_sem_seg(root):
    """Register every split listed in ``_PREDEFINED_SPLITS_COCO_PANOPTIC``.

    The image root and instances json of each split are read from the metadata
    of the matching instance dataset (the split name without its "_panoptic"
    suffix), so that dataset must already have metadata in ``MetadataCatalog``.

    Args:
        root (str): directory that the split-relative panoptic and semantic
            paths are joined onto.
    """
    for prefix, (panoptic_root, panoptic_json, semantic_root) in _PREDEFINED_SPLITS_COCO_PANOPTIC.items():
        prefix_instances = prefix[: -len("_panoptic")]
        instances_meta = MetadataCatalog.get(prefix_instances)
        image_root, instances_json = instances_meta.image_root, instances_meta.json_file

        # Validation splits use the "panoptic2instances_*" json. Only the file
        # name is inspected and rewritten: a directory that happens to contain
        # "val" (e.g. ".../eval/...") must not switch the train split, and a
        # directory containing "instances_" must not be renamed.
        json_dir, json_name = os.path.split(instances_json)
        if "val" in json_name:
            instances_json = os.path.join(
                json_dir, json_name.replace("instances_", "panoptic2instances_")
            )

        register_coco_panoptic_annos_sem_seg(
            prefix,
            get_metadata(),
            image_root,
            os.path.join(root, panoptic_root),
            os.path.join(root, panoptic_json),
            os.path.join(root, semantic_root),
            instances_json,
            prefix_instances,
        )


# Expand "~" so that a home-relative DETECTRON2_DATASETS value resolves to a real
# directory, matching the other dataset registration modules.
_root = os.path.expanduser(os.getenv("DETECTRON2_DATASETS", "datasets"))
register_all_coco_panoptic_annos_sem_seg(_root)

oneformer/data/tokenizer.py ADDED Viewed

	@@ -0,0 +1,200 @@

+# -------------------------------------------------------------------------
+# MIT License
+#
+# Copyright (c) 2021 OpenAI
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+# Modified by Jiarui Xu
+# -------------------------------------------------------------------------
+import gzip
+import html
+import os
+from functools import lru_cache
+import ftfy
+import regex as re
+import torch
@lru_cache()
def default_bpe():
    """Return the absolute path of the BPE vocabulary file located next to this module."""
    module_dir = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(module_dir, 'bpe_simple_vocab_16e6.txt')
@lru_cache()
def bytes_to_unicode():
    """Return a dict mapping each of the 256 byte values to a unicode character.

    The reversible BPE codes operate on unicode strings, so every byte needs a
    character of its own. Bytes in the ranges '!'..'~', '¡'..'¬' and '®'..'ÿ'
    map to the character with the same code point. Every other byte is mapped,
    in increasing byte order, to consecutive code points starting at 256, which
    keeps whitespace and control characters (which the BPE code cannot handle)
    out of the table.
    """
    kept = [
        *range(ord('!'), ord('~') + 1),
        *range(ord('¡'), ord('¬') + 1),
        *range(ord('®'), ord('ÿ') + 1),
    ]
    remapped = [byte for byte in range(2**8) if byte not in kept]
    byte_values = kept + remapped
    code_points = kept + [2**8 + offset for offset in range(len(remapped))]
    return dict(zip(byte_values, map(chr, code_points)))
def get_pairs(word):
    """Return the set of adjacent symbol pairs in a word.

    Args:
        word: sequence of symbols (a tuple of variable-length strings, or any
            other sliceable sequence).

    Returns:
        set: every ``(symbol, next_symbol)`` pair; empty when the word has
        fewer than two symbols, including an empty word.
    """
    # Pairing the word with itself shifted by one yields each neighbouring pair
    # and is naturally empty for words shorter than two symbols.
    return set(zip(word, word[1:]))
def basic_clean(text):
    """Run ``ftfy.fix_text`` on the text, decode HTML entities and strip the ends."""
    fixed = ftfy.fix_text(text)
    # Unescaping twice also decodes doubly-escaped entities such as "&amp;lt;".
    unescaped = html.unescape(html.unescape(fixed))
    return unescaped.strip()
def whitespace_clean(text):
    """Collapse every run of whitespace into one space and strip both ends."""
    return re.sub(r'\s+', ' ', text).strip()
class Tokenize:
    """Callable that converts text into fixed-length tensors of token ids.

    Each text is encoded with the wrapped tokenizer, wrapped in the
    '<|startoftext|>' / '<|endoftext|>' ids and right-padded with zeros up to
    ``max_seq_len``.
    """

    def __init__(self, tokenizer, max_seq_len=77, truncate=True):
        """
        Args:
            tokenizer: object exposing an ``encoder`` mapping and an
                ``encode(text)`` method returning a list of ids.
            max_seq_len (int): length of every output row.
            truncate (bool): cut over-long inputs when True, raise otherwise.
        """
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len
        self.truncate = truncate

    def __call__(self, texts):
        """Tokenize one string or an iterable of strings.

        Returns:
            torch.Tensor: long tensor of shape (max_seq_len,) for a single
            string, otherwise (number of texts, max_seq_len).

        Raises:
            RuntimeError: if an input exceeds ``max_seq_len`` and truncation is
                disabled.
        """
        expanded_dim = isinstance(texts, str)
        if expanded_dim:
            texts = [texts]

        sot_token = self.tokenizer.encoder['<|startoftext|>']
        eot_token = self.tokenizer.encoder['<|endoftext|>']
        all_tokens = [[sot_token] + self.tokenizer.encode(text) + [eot_token] for text in texts]

        limit = self.max_seq_len
        result = torch.zeros(len(all_tokens), limit, dtype=torch.long)
        for row, tokens in enumerate(all_tokens):
            if len(tokens) > limit:
                if not self.truncate:
                    raise RuntimeError(f'Input {texts[row]} is too long for context length {self.max_seq_len}')
                # Keep the first `limit` ids but make sure the row still ends
                # with the end-of-text id.
                tokens = tokens[:limit]
                tokens[-1] = eot_token
            result[row, :len(tokens)] = torch.tensor(tokens)

        return result[0] if expanded_dim else result
class SimpleTokenizer(object):
    """Byte-level BPE tokenizer driven by a plain-text merges file.

    The vocabulary consists of the 256 byte characters, the same characters
    with an end-of-word marker ('</w>'), one entry per merge rule, and the
    '<|startoftext|>' / '<|endoftext|>' special tokens.
    """

    def __init__(self, bpe_path: str = default_bpe()):
        """
        Args:
            bpe_path (str): path of the text file holding one merge rule per
                line; its first line is skipped.
        """
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        # Read the merges as UTF-8 explicitly so that parsing does not depend on
        # the platform's default encoding.
        with open(bpe_path, encoding='utf-8') as f:
            contents = f.readlines()
        # Equivalent to splitting the whole file on '\n': strip each line's
        # newline and keep the empty entry that follows the last one.
        merges = [line.split('\n')[0] for line in contents]
        merges.append("")
        # Drop the first line and keep at most 49152 - 256 - 2 merge rules.
        merges = merges[1:49152 - 256 - 2 + 1]
        merges = [tuple(merge.split()) for merge in merges]
        vocab = list(bytes_to_unicode().values())
        vocab = vocab + [v + '</w>' for v in vocab]
        for merge in merges:
            vocab.append(''.join(merge))
        vocab.extend(['<|startoftext|>', '<|endoftext|>'])
        self.encoder = dict(zip(vocab, range(len(vocab))))
        self.decoder = {v: k for k, v in self.encoder.items()}
        # Rank of a merge rule = its position in the file; lower merges first.
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        # The special tokens are pre-seeded so that `bpe` returns them whole.
        self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'}
        self.pat = re.compile(
            r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""",
            re.IGNORECASE)

    def bpe(self, token):
        """Apply the merge rules to one token.

        Args:
            token (str): non-empty token, already mapped to byte characters.

        Returns:
            str: the resulting symbols joined by single spaces; the last symbol
            carries the '</w>' end-of-word marker. Results are cached per token.
        """
        if token in self.cache:
            return self.cache[token]
        word = tuple(token[:-1]) + (token[-1] + '</w>', )
        pairs = get_pairs(word)
        if not pairs:
            return token + '</w>'
        while True:
            # Always merge the adjacent pair with the lowest rank; stop as soon
            # as no remaining pair has a merge rule.
            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                except ValueError:
                    # `first` does not occur again: copy the rest unchanged.
                    new_word.extend(word[i:])
                    break
                new_word.extend(word[i:j])
                i = j
                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
                    new_word.append(first + second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            word = tuple(new_word)
            if len(word) == 1:
                break
            pairs = get_pairs(word)
        word = ' '.join(word)
        self.cache[token] = word
        return word

    def encode(self, text):
        """Clean and lower-case the text, then return its list of BPE token ids."""
        bpe_tokens = []
        text = whitespace_clean(basic_clean(text)).lower()
        for token in re.findall(self.pat, text):
            # Re-express the token's UTF-8 bytes with the byte-character table.
            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
        return bpe_tokens

    def decode(self, tokens):
        """Turn token ids back into text; every '</w>' marker becomes a space."""
        text = ''.join([self.decoder[token] for token in tokens])
        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors='replace').replace('</w>', ' ')
        return text

oneformer/evaluation/__init__.py ADDED Viewed

	@@ -0,0 +1,3 @@

+from .detection_coco_evaluator import *
+from .coco_evaluator import *
+from .cityscapes_evaluation import CityscapesInstanceEvaluator

oneformer/evaluation/cityscapes_evaluation.py ADDED Viewed

	@@ -0,0 +1,201 @@

+# ------------------------------------------------------------------------------
+# Reference: https://github.com/facebookresearch/detectron2/blob/main/detectron2/evaluation/cityscapes_evaluation.py
+# Modified by Jitesh Jain (https://github.com/praeclarumjj3)
+# ------------------------------------------------------------------------------
+import glob
+import logging
+import numpy as np
+import os
+import tempfile
+from collections import OrderedDict
+import torch
+from PIL import Image
+from detectron2.data import MetadataCatalog
+from detectron2.utils import comm
+from detectron2.utils.file_io import PathManager
+from .evaluator import DatasetEvaluator
class CityscapesEvaluator(DatasetEvaluator):
    """
    Common setup shared by the evaluators that rely on the cityscapes API.
    """

    def __init__(self, dataset_name):
        """
        Args:
            dataset_name (str): the name of the dataset whose metadata is
                looked up in ``MetadataCatalog`` and kept for the subclasses.
        """
        self._metadata = MetadataCatalog.get(dataset_name)
        self._cpu_device = torch.device("cpu")
        self._logger = logging.getLogger(__name__)

    def reset(self):
        """Create the temporary results directory shared by all workers."""
        working_dir = tempfile.TemporaryDirectory(prefix="cityscapes_eval_")
        self._working_dir = working_dir
        self._temp_dir = working_dir.name
        # All workers will write to the same results directory
        # TODO this does not work in distributed training
        assert (
            comm.get_local_size() == comm.get_world_size()
        ), "CityscapesEvaluator currently do not work with multiple machines."
        # Every worker adopts the first entry of the gathered directory names.
        self._temp_dir = comm.all_gather(self._temp_dir)[0]
        if self._temp_dir != working_dir.name:
            # This worker's own directory will not be used; remove it now.
            working_dir.cleanup()
        self._logger.info(
            "Writing cityscapes results to temporary directory {} ...".format(self._temp_dir)
        )
class CityscapesInstanceEvaluator(CityscapesEvaluator):
    """
    Evaluate instance segmentation results on cityscapes dataset using cityscapes API.

    Note:
        * It does not work in multi-machine distributed training.
        * It contains a synchronization, therefore has to be used on all ranks.
        * Only the main process runs evaluation.
    """

    def process(self, inputs, outputs):
        """Write one ``*_pred.txt`` file, plus one mask PNG per instance, for every input."""
        from cityscapesscripts.helpers.labels import name2label

        for sample, output in zip(inputs, outputs):
            basename = os.path.splitext(os.path.basename(sample["file_name"]))[0]
            pred_txt = os.path.join(self._temp_dir, basename + "_pred.txt")

            if "instances" not in output:
                # Cityscapes requires a prediction file for every ground truth image.
                with open(pred_txt, "w") as fout:
                    pass
                continue

            instances = output["instances"].to(self._cpu_device)
            with open(pred_txt, "w") as fout:
                for index in range(len(instances)):
                    pred_class = instances.pred_classes[index]
                    # The class name is looked up in `stuff_classes`.
                    classes = self._metadata.stuff_classes[pred_class]
                    class_id = name2label[classes].id
                    score = instances.scores[index]
                    mask = instances.pred_masks[index].numpy().astype("uint8")
                    png_filename = os.path.join(
                        self._temp_dir, basename + "_{}_{}.png".format(index, classes)
                    )
                    Image.fromarray(mask * 255).save(png_filename)
                    # One line per instance: mask file name, label id, score.
                    fout.write(
                        "{} {} {}\n".format(os.path.basename(png_filename), class_id, score)
                    )

    def evaluate(self):
        """
        Returns:
            dict: has a key "segm", whose value is a dict of "AP" and "AP50".
            Ranks other than 0 return None.
        """
        comm.synchronize()
        if comm.get_rank() > 0:
            return
        import cityscapesscripts.evaluation.evalInstanceLevelSemanticLabeling as cityscapes_eval

        self._logger.info("Evaluating results under {} ...".format(self._temp_dir))

        # set some global states in cityscapes evaluation API, before evaluating
        eval_args = cityscapes_eval.args
        eval_args.predictionPath = os.path.abspath(self._temp_dir)
        eval_args.predictionWalk = None
        eval_args.JSONOutput = False
        eval_args.colorized = False
        eval_args.gtInstancesFile = os.path.join(self._temp_dir, "gtInstances.json")

        # These lines are adopted from
        # https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/evalInstanceLevelSemanticLabeling.py # noqa
        gt_dir = PathManager.get_local_path(self._metadata.gt_dir)
        groundTruthImgList = glob.glob(os.path.join(gt_dir, "*", "*_gtFine_instanceIds.png"))
        assert len(
            groundTruthImgList
        ), "Cannot find any ground truth images to use for evaluation. Searched for: {}".format(
            cityscapes_eval.args.groundTruthSearch
        )
        predictionImgList = [
            cityscapes_eval.getPrediction(gt, cityscapes_eval.args) for gt in groundTruthImgList
        ]
        results = cityscapes_eval.evaluateImgLists(
            predictionImgList, groundTruthImgList, cityscapes_eval.args
        )["averages"]

        ret = OrderedDict()
        # The averages are scaled by 100 before being reported.
        ret["segm"] = {"AP": results["allAp"] * 100, "AP50": results["allAp50%"] * 100}
        self._working_dir.cleanup()
        return ret
+class CityscapesSemSegEvaluator(CityscapesEvaluator):
+    """
+    Evaluate semantic segmentation results on cityscapes dataset using cityscapes API.
+    Note:
+        * It does not work in multi-machine distributed training.
+        * It contains a synchronization, therefore has to be used on all ranks.
+        * Only the main process runs evaluation.
+    """
+    def process(self, inputs, outputs):
+        from cityscapesscripts.helpers.labels import trainId2label
+        for input, output in zip(inputs, outputs):
+            file_name = input["file_name"]
+            basename = os.path.splitext(os.path.basename(file_name))[0]
+            pred_filename = os.path.join(self._temp_dir, basename + "_pred.png")
+            output = output["sem_seg"].argmax(dim=0).to(self._cpu_device).numpy()
+            pred = 255 * np.ones(output.shape, dtype=np.uint8)
+            for train_id, label in trainId2label.items():
+                if label.ignoreInEval:
+                    continue
+                pred[output == train_id] = label.id
+            Image.fromarray(pred).save(pred_filename)
+    def evaluate(self):
+        comm.synchronize()
+        if comm.get_rank() > 0:
+            return
+        # Load the Cityscapes eval script *after* setting the required env var,
+        # since the script reads CITYSCAPES_DATASET into global variables at load time.
+        import cityscapesscripts.evaluation.evalPixelLevelSemanticLabeling as cityscapes_eval
+        self._logger.info("Evaluating results under {} ...".format(self._temp_dir))
+        # set some global states in cityscapes evaluation API, before evaluating
+        cityscapes_eval.args.predictionPath = os.path.abspath(self._temp_dir)
+        cityscapes_eval.args.predictionWalk = None
+        cityscapes_eval.args.JSONOutput = False
+        cityscapes_eval.args.colorized = False
+        # These lines are adopted from
+        # https://github.com/mcordts/cityscapesScripts/blob/master/cityscapesscripts/evaluation/evalPixelLevelSemanticLabeling.py # noqa
+        gt_dir = PathManager.get_local_path(self._metadata.gt_dir)
+        groundTruthImgList = glob.glob(os.path.join(gt_dir, "*", "*_gtFine_labelIds.png"))
+        assert len(
+            groundTruthImgList
+        ), "Cannot find any ground truth images to use for evaluation. Searched for: {}".format(
+            cityscapes_eval.args.groundTruthSearch
+        )
+        predictionImgList = []
+        for gt in groundTruthImgList:
+            predictionImgList.append(cityscapes_eval.getPrediction(cityscapes_eval.args, gt))
+        results = cityscapes_eval.evaluateImgLists(
+            predictionImgList, groundTruthImgList, cityscapes_eval.args
+        )
+        ret = OrderedDict()
+        ret["sem_seg"] = {
+            "IoU": 100.0 * results["averageScoreClasses"],
+            "iIoU": 100.0 * results["averageScoreInstClasses"],
+            "IoU_sup": 100.0 * results["averageScoreCategories"],
+            "iIoU_sup": 100.0 * results["averageScoreInstCategories"],
+        }
+        self._working_dir.cleanup()
+        return ret