ddoc committed on Jul 2, 2023

Commit

9855482

1 Parent(s): 6ca9f66

Upload 921 files

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.github/ISSUE_TEMPLATE/bug_report.yml +91 -0
.github/ISSUE_TEMPLATE/config.yml +1 -0
.github/workflows/tests.yml +37 -0
.gitignore +171 -0
LICENSE +21 -0
README.md +247 -0
annotator/annotator_path.py +22 -0
annotator/binary/__init__.py +14 -0
annotator/canny/__init__.py +5 -0
annotator/clip/__init__.py +39 -0
annotator/clip_vision/config.json +171 -0
annotator/clip_vision/merges.txt +0 -0
annotator/clip_vision/preprocessor_config.json +19 -0
annotator/clip_vision/tokenizer.json +0 -0
annotator/clip_vision/tokenizer_config.json +34 -0
annotator/clip_vision/vocab.json +0 -0
annotator/color/__init__.py +20 -0
annotator/hed/__init__.py +98 -0
annotator/keypose/__init__.py +212 -0
annotator/keypose/faster_rcnn_r50_fpn_coco.py +182 -0
annotator/keypose/hrnet_w48_coco_256x192.py +169 -0
annotator/lama/__init__.py +58 -0
annotator/lama/config.yaml +157 -0
annotator/lama/saicinpainting/__init__.py +0 -0
annotator/lama/saicinpainting/training/__init__.py +0 -0
annotator/lama/saicinpainting/training/data/__init__.py +0 -0
annotator/lama/saicinpainting/training/data/masks.py +332 -0
annotator/lama/saicinpainting/training/losses/__init__.py +0 -0
annotator/lama/saicinpainting/training/losses/adversarial.py +177 -0
annotator/lama/saicinpainting/training/losses/constants.py +152 -0
annotator/lama/saicinpainting/training/losses/distance_weighting.py +126 -0
annotator/lama/saicinpainting/training/losses/feature_matching.py +33 -0
annotator/lama/saicinpainting/training/losses/perceptual.py +113 -0
annotator/lama/saicinpainting/training/losses/segmentation.py +43 -0
annotator/lama/saicinpainting/training/losses/style_loss.py +155 -0
annotator/lama/saicinpainting/training/modules/__init__.py +31 -0
annotator/lama/saicinpainting/training/modules/base.py +80 -0
annotator/lama/saicinpainting/training/modules/depthwise_sep_conv.py +17 -0
annotator/lama/saicinpainting/training/modules/fake_fakes.py +47 -0
annotator/lama/saicinpainting/training/modules/ffc.py +485 -0
annotator/lama/saicinpainting/training/modules/multidilated_conv.py +98 -0
annotator/lama/saicinpainting/training/modules/multiscale.py +244 -0
annotator/lama/saicinpainting/training/modules/pix2pixhd.py +669 -0
annotator/lama/saicinpainting/training/modules/spatial_transform.py +49 -0
annotator/lama/saicinpainting/training/modules/squeeze_excitation.py +20 -0
annotator/lama/saicinpainting/training/trainers/__init__.py +29 -0
annotator/lama/saicinpainting/training/trainers/base.py +293 -0
annotator/lama/saicinpainting/training/trainers/default.py +175 -0
annotator/lama/saicinpainting/training/visualizers/__init__.py +15 -0
annotator/lama/saicinpainting/training/visualizers/base.py +73 -0

.github/ISSUE_TEMPLATE/bug_report.yml ADDED Viewed

	@@ -0,0 +1,91 @@

+name: Bug Report
+description: Create a report
+title: "[Bug]: "
+labels: ["bug-report"]
+body:
+  - type: checkboxes
+    attributes:
+      label: Is there an existing issue for this?
+      description: Please search to see if an issue already exists for the bug you encountered, and that it hasn't been fixed in a recent build/commit.
+      options:
+        - label: I have searched the existing issues and checked the recent builds/commits of both this extension and the webui
+          required: true
+  - type: markdown
+    attributes:
+      value: |
+        *Please fill this form with as much information as possible, don't forget to fill "What OS..." and "What browsers" and **provide screenshots if possible***
+  - type: textarea
+    id: what-did
+    attributes:
+      label: What happened?
+      description: Tell us what happened in a very clear and simple way
+    validations:
+      required: true
+  - type: textarea
+    id: steps
+    attributes:
+      label: Steps to reproduce the problem
+      description: Please provide us with precise step by step information on how to reproduce the bug
+      value: |
+        1. Go to ....
+        2. Press ....
+        3. ...
+    validations:
+      required: true
+  - type: textarea
+    id: what-should
+    attributes:
+      label: What should have happened?
+      description: Tell what you think the normal behavior should be
+    validations:
+      required: true
+  - type: textarea
+    id: commits
+    attributes:
+      label: Commit where the problem happens
+      description: Which commit of the extension are you running on? Please include the commit of both the extension and the webui (Do not write *Latest version/repo/commit*, as this means nothing and will have changed by the time we read your issue. Rather, copy the **Commit** link at the bottom of the UI, or from the cmd/terminal if you can't launch it.)
+      value: |
+        webui:
+        controlnet:
+    validations:
+      required: true
+  - type: dropdown
+    id: browsers
+    attributes:
+      label: What browsers do you use to access the UI ?
+      multiple: true
+      options:
+        - Mozilla Firefox
+        - Google Chrome
+        - Brave
+        - Apple Safari
+        - Microsoft Edge
+  - type: textarea
+    id: cmdargs
+    attributes:
+      label: Command Line Arguments
+      description: Are you using any launching parameters/command line arguments (modified webui-user .bat/.sh) ? If yes, please write them below. Write "No" otherwise.
+      render: Shell
+    validations:
+      required: true
+  - type: textarea
+    id: extensions
+    attributes:
+      label: List of enabled extensions
+      description: Please provide a full list of enabled extensions or screenshots of your "Extensions" tab.
+    validations:
+      required: true
+  - type: textarea
+    id: logs
+    attributes:
+      label: Console logs
+      description: Please provide full cmd/terminal logs from the moment you started UI to the end of it, after your bug happened. If it's very long, provide a link to pastebin or similar service.
+      render: Shell
+    validations:
+      required: true
+  - type: textarea
+    id: misc
+    attributes:
+      label: Additional information
+      description: Please provide us with any relevant additional info or context.

.github/ISSUE_TEMPLATE/config.yml ADDED Viewed

	@@ -0,0 +1 @@


1	+ blank_issues_enabled: true

.github/workflows/tests.yml ADDED Viewed

	@@ -0,0 +1,37 @@

+name: Run basic features tests on CPU
+on:
+  - push
+  - pull_request
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout Code
+        uses: actions/checkout@v3
+        with:
+          repository: 'AUTOMATIC1111/stable-diffusion-webui'
+          path: 'stable-diffusion-webui'
+          ref: '5ab7f213bec2f816f9c5644becb32eb72c8ffb89'
+      - name: Checkout Code
+        uses: actions/checkout@v3
+        with:
+          repository: 'Mikubill/sd-webui-controlnet'
+          path: 'stable-diffusion-webui/extensions/sd-webui-controlnet'
+      - name: Set up Python 3.10
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.10.6
+          cache: pip
+          cache-dependency-path: |
+            **/requirements*txt
+            stable-diffusion-webui/requirements*txt
+      - run: |
+          pip install torch torchvision
+          curl -Lo stable-diffusion-webui/extensions/sd-webui-controlnet/models/control_canny-fp16.safetensors https://huggingface.co/webui/ControlNet-modules-safetensors/resolve/main/control_canny-fp16.safetensors
+          cd stable-diffusion-webui && python launch.py --no-half --disable-opt-split-attention --use-cpu all --skip-torch-cuda-test --api --tests ./extensions/sd-webui-controlnet/tests
+          # The `cd` above persists for the rest of this step, so the cleanup path
+          # must be relative to stable-diffusion-webui/, not to the workspace root.
+          rm -fr extensions/sd-webui-controlnet/models/control_canny-fp16.safetensors

.gitignore ADDED Viewed

	@@ -0,0 +1,171 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea
+*.pt
+*.pth
+*.ckpt
+*.bin
+*.safetensors
+# Editor setting metadata
+.idea/
+.vscode/
+detected_maps/
+annotator/downloads/

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2023 Kakigōri Maker
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

README.md ADDED Viewed

	@@ -0,0 +1,247 @@

+# ControlNet for Stable Diffusion WebUI
+The WebUI extension for ControlNet and other injection-based SD controls.
+![image](https://user-images.githubusercontent.com/20929282/246632890-400b2e0d-b064-4505-b31d-49375216ca98.png)
+This extension is for AUTOMATIC1111's [Stable Diffusion web UI](https://github.com/AUTOMATIC1111/stable-diffusion-webui). It allows the Web UI to add [ControlNet](https://github.com/lllyasviel/ControlNet) to the original Stable Diffusion model to generate images. The addition is done on the fly; no model merging is required.
+# Installation
+1. Open "Extensions" tab.
+2. Open "Install from URL" tab in the tab.
+3. Enter `https://github.com/Mikubill/sd-webui-controlnet.git` to "URL for extension's git repository".
+4. Press "Install" button.
+5. Wait for 5 seconds, and you will see the message "Installed into stable-diffusion-webui\extensions\sd-webui-controlnet. Use Installed tab to restart".
+6. Go to "Installed" tab, click "Check for updates", and then click "Apply and restart UI". (The next time you can also use these buttons to update ControlNet.)
+7. Completely restart A1111 webui including your terminal. (If you do not know what a "terminal" is, you can reboot your computer to achieve the same effect.)
+8. Download models (see below).
+9. After you put models in the correct folder, you may need to refresh to see the models. The refresh button is to the right of your "Model" dropdown.
+**Update from ControlNet 1.0 to 1.1:**
+* If you are not sure, you can back up and remove the folder "stable-diffusion-webui\extensions\sd-webui-controlnet", and then start from the step 1 in the above Installation section.
+* Or you can start from the step 6 in the above Installation section.
+# Download Models
+Right now all the 14 models of ControlNet 1.1 are in the beta test.
+Download the models from ControlNet 1.1: https://huggingface.co/lllyasviel/ControlNet-v1-1/tree/main
+You need to download model files ending with ".pth" .
+Put models in your "stable-diffusion-webui\extensions\sd-webui-controlnet\models". Now we have already included all "yaml" files. You only need to download "pth" files.
+Do not right-click the filenames in HuggingFace website to download. Some users right-clicked those HuggingFace HTML websites and saved those HTML pages as PTH/YAML files. They are not downloading correct files. Instead, please click the small download arrow “↓” icon in HuggingFace to download.
+Note: If you download models elsewhere, please make sure that the yaml file names and model file names are the same. Please manually rename all yaml files if you download from other sources. (Some models like "shuffle" need the yaml file so that we know the outputs of ControlNet should pass through a global average pooling before being injected into the SD U-Nets.)
+# New Features in ControlNet 1.1
+### Perfect Support for All ControlNet 1.0/1.1 and T2I Adapter Models.
+Now we have perfect support for all available models and preprocessors, including perfect support for T2I style adapter and ControlNet 1.1 Shuffle. (Make sure that your YAML file names and model file names are the same; see also the YAML files in "stable-diffusion-webui\extensions\sd-webui-controlnet\models".)
+### Perfect Support for A1111 High-Res. Fix
+Now if you turn on High-Res Fix in A1111, each controlnet will output two different control images: a small one and a large one. The small one is for your basic generating, and the big one is for your High-Res Fix generating. The two control images are computed by a smart algorithm called "super high-quality control image resampling". This is turned on by default, and you do not need to change any setting.
+### Perfect Support for All A1111 Img2Img or Inpaint Settings and All Mask Types
+Now ControlNet is extensively tested with A1111's different types of masks, including "Inpaint masked"/"Inpaint not masked", and "Whole picture"/"Only masked", and "Only masked padding"&"Mask blur". The resizing perfectly matches A1111's "Just resize"/"Crop and resize"/"Resize and fill". This means you can use ControlNet nearly everywhere in your A1111 UI without difficulty!
+### The New "Pixel-Perfect" Mode
+Now if you turn on pixel-perfect mode, you do not need to set preprocessor (annotator) resolutions manually. The ControlNet will automatically compute the best annotator resolution for you so that each pixel perfectly matches Stable Diffusion.
+### User-Friendly GUI and Preprocessor Preview
+We reorganized some previously confusing UI like "canvas width/height for new canvas" and it is in the 📝 button now. Now the preview GUI is controlled by the "allow preview" option and the trigger button 💥. The preview image size is better than before, and you do not need to scroll up and down - your a1111 GUI will not be messed up anymore!
+### Support for Almost All Upscaling Scripts
+Now ControlNet 1.1 can support almost all Upscaling/Tile methods. ControlNet 1.1 supports the script "Ultimate SD upscale" and almost all other tile-based extensions. Please do not confuse ["Ultimate SD upscale"](https://github.com/Coyote-A/ultimate-upscale-for-automatic1111) with "SD upscale" - they are different scripts. Note that the most recommended upscaling method is ["Tiled VAE/Diffusion"](https://github.com/pkuliyi2015/multidiffusion-upscaler-for-automatic1111) but we test as many methods/extensions as possible. Note that "SD upscale" has been supported since 1.1.117, and if you use it, you need to leave all ControlNet images blank (We do not recommend "SD upscale" since it is somewhat buggy and cannot be maintained - use the "Ultimate SD upscale" instead).
+### More Control Modes (previously called Guess Mode)
+We have fixed many bugs in previous 1.0’s Guess Mode and now it is called Control Mode.
+![image](https://user-images.githubusercontent.com/19834515/236641759-6c44ddf6-c7ad-4bda-92be-e90a52911d75.png)
+Now you can control which aspect is more important (your prompt or your ControlNet):
+* "Balanced": ControlNet on both sides of CFG scale, same as turning off "Guess Mode" in ControlNet 1.0
+* "My prompt is more important": ControlNet on both sides of CFG scale, with progressively reduced SD U-Net injections (layer_weight*=0.825**I, where 0<=I <13, and the 13 means ControlNet injected SD 13 times). In this way, you can make sure that your prompts are perfectly displayed in your generated images.
+* "ControlNet is more important": ControlNet only on the Conditional Side of CFG scale (the cond in A1111's batch-cond-uncond). This means the ControlNet will be X times stronger if your cfg-scale is X. For example, if your cfg-scale is 7, then ControlNet is 7 times stronger. Note that here the X times stronger is different from "Control Weights" since your weights are not modified. This "stronger" effect usually has fewer artifacts and gives ControlNet more room to guess what is missing from your prompts (and in the previous 1.0, it was called "Guess Mode").
+<table width="100%">
+<tr>
+<td width="25%" style="text-align: center">Input (depth+canny+hed)</td>
+<td width="25%" style="text-align: center">"Balanced"</td>
+<td width="25%" style="text-align: center">"My prompt is more important"</td>
+<td width="25%" style="text-align: center">"ControlNet is more important"</td>
+</tr>
+<tr>
+<td width="25%" style="text-align: center"><img src="samples/cm1.png"></td>
+<td width="25%" style="text-align: center"><img src="samples/cm2.png"></td>
+<td width="25%" style="text-align: center"><img src="samples/cm3.png"></td>
+<td width="25%" style="text-align: center"><img src="samples/cm4.png"></td>
+</tr>
+</table>
+### Reference-Only Control
+Now we have a `reference-only` preprocessor that does not require any control models. It can guide the diffusion directly using images as references.
+(Prompt "a dog running on grassland, best quality, ...")
+![image](samples/ref.png)
+This method is similar to inpaint-based reference but it does not make your image disordered.
+Many professional A1111 users know a trick to diffuse image with references by inpaint. For example, if you have a 512x512 image of a dog, and want to generate another 512x512 image with the same dog, some users will connect the 512x512 dog image and a 512x512 blank image into a 1024x512 image, send to inpaint, and mask out the blank 512x512 part to diffuse a dog with similar appearance. However, that method is usually not very satisfying since images are connected and many distortions will appear.
+This `reference-only` ControlNet can directly link the attention layers of your SD to any independent images, so that your SD will read arbitrary images for reference. You need at least ControlNet 1.1.153 to use it.
+To use, just select `reference-only` as preprocessor and put an image. Your SD will just use the image as reference.
+*Note that this method is as "non-opinioned" as possible. It only contains very basic connection codes, without any personal preferences, to connect the attention layers with your reference images. However, even if we tried best to not include any opinioned codes, we still need to write some subjective implementations to deal with weighting, cfg-scale, etc - tech report is on the way.*
+More examples [here](https://github.com/Mikubill/sd-webui-controlnet/discussions/1236).
+# Technical Documents
+See also the documents of ControlNet 1.1:
+https://github.com/lllyasviel/ControlNet-v1-1-nightly#model-specification
+# Default Setting
+This is my setting. If you run into any problem, you can use this setting as a sanity check
+![image](https://user-images.githubusercontent.com/19834515/235620638-17937171-8ac1-45bc-a3cb-3aebf605b4ef.png)
+# Use Previous Models
+### Use ControlNet 1.0 Models
+https://huggingface.co/lllyasviel/ControlNet/tree/main/models
+You can still use all models from the previous ControlNet 1.0. The previous "depth" is now called "depth_midas", the previous "normal" is called "normal_midas", and the previous "hed" is called "softedge_hed". Starting from 1.1, all line maps, edge maps, lineart maps, and boundary maps have a black background and white lines.
+### Use T2I-Adapter Models
+(From TencentARC/T2I-Adapter)
+To use T2I-Adapter models:
+1. Download files from https://huggingface.co/TencentARC/T2I-Adapter/tree/main/models
+2. Put them in "stable-diffusion-webui\extensions\sd-webui-controlnet\models".
+3. Make sure that the file names of pth files and yaml files are consistent.
+*Note that "CoAdapter" is not implemented yet.*
+# Gallery
+The below results are from ControlNet 1.0.
+| Source | Input | Output |
+|:-------------------------:|:-------------------------:|:-------------------------:|
+| (no preprocessor) |  <img width="256" alt="" src="https://github.com/Mikubill/sd-webui-controlnet/blob/main/samples/bal-source.png?raw=true"> | <img width="256" alt="" src="https://github.com/Mikubill/sd-webui-controlnet/blob/main/samples/bal-gen.png?raw=true"> |
+| (no preprocessor) |  <img width="256" alt="" src="https://github.com/Mikubill/sd-webui-controlnet/blob/main/samples/dog_rel.jpg?raw=true"> | <img width="256" alt="" src="https://github.com/Mikubill/sd-webui-controlnet/blob/main/samples/dog_rel.png?raw=true"> |
+|<img width="256" alt="" src="https://github.com/Mikubill/sd-webui-controlnet/blob/main/samples/mahiro_input.png?raw=true">  |  <img width="256" alt="" src="https://github.com/Mikubill/sd-webui-controlnet/blob/main/samples/mahiro_canny.png?raw=true"> | <img width="256" alt="" src="https://github.com/Mikubill/sd-webui-controlnet/blob/main/samples/mahiro-out.png?raw=true"> |
+|<img width="256" alt="" src="https://github.com/Mikubill/sd-webui-controlnet/blob/main/samples/evt_source.jpg?raw=true">  |  <img width="256" alt="" src="https://github.com/Mikubill/sd-webui-controlnet/blob/main/samples/evt_hed.png?raw=true"> | <img width="256" alt="" src="https://github.com/Mikubill/sd-webui-controlnet/blob/main/samples/evt_gen.png?raw=true"> |
+|<img width="256" alt="" src="https://github.com/Mikubill/sd-webui-controlnet/blob/main/samples/an-source.jpg?raw=true">  |  <img width="256" alt="" src="https://github.com/Mikubill/sd-webui-controlnet/blob/main/samples/an-pose.png?raw=true"> | <img width="256" alt="" src="https://github.com/Mikubill/sd-webui-controlnet/blob/main/samples/an-gen.png?raw=true"> |
+|<img width="256" alt="" src="https://github.com/Mikubill/sd-webui-controlnet/blob/main/samples/sk-b-src.png?raw=true">  |  <img width="256" alt="" src="https://github.com/Mikubill/sd-webui-controlnet/blob/main/samples/sk-b-dep.png?raw=true"> | <img width="256" alt="" src="https://github.com/Mikubill/sd-webui-controlnet/blob/main/samples/sk-b-out.png?raw=true"> |
+The below examples are from T2I-Adapter.
+From `t2iadapter_color_sd14v1.pth` :
+| Source | Input | Output |
+|:-------------------------:|:-------------------------:|:-------------------------:|
+| <img width="256" alt="" src="https://user-images.githubusercontent.com/31246794/222947416-ec9e52a4-a1d0-48d8-bb81-736bf636145e.jpeg"> | <img width="256" alt="" src="https://user-images.githubusercontent.com/31246794/222947435-1164e7d8-d857-42f9-ab10-2d4a4b25f33a.png"> | <img width="256" alt="" src="https://user-images.githubusercontent.com/31246794/222947557-5520d5f8-88b4-474d-a576-5c9cd3acac3a.png"> |
+From `t2iadapter_style_sd14v1.pth` :
+| Source | Input | Output |
+|:-------------------------:|:-------------------------:|:-------------------------:|
+| <img width="256" alt="" src="https://user-images.githubusercontent.com/31246794/222947416-ec9e52a4-a1d0-48d8-bb81-736bf636145e.jpeg"> | (clip, non-image) | <img width="256" alt="" src="https://user-images.githubusercontent.com/31246794/222965711-7b884c9e-7095-45cb-a91c-e50d296ba3a2.png"> |
+# Minimum Requirements
+* (Windows) (NVIDIA: Ampere) 4gb - with `--xformers` enabled, and `Low VRAM` mode ticked in the UI, goes up to 768x832
+# Multi-ControlNet
+This option allows multiple ControlNet inputs for a single generation. To enable this option, change `Multi ControlNet: Max models amount (requires restart)` in the settings. Note that you will need to restart the WebUI for changes to take effect.
+<table width="100%">
+<tr>
+<td width="25%" style="text-align: center">Source A</td>
+<td width="25%" style="text-align: center">Source B</td>
+<td width="25%" style="text-align: center">Output</td>
+</tr>
+<tr>
+<td width="25%" style="text-align: center"><img src="https://user-images.githubusercontent.com/31246794/220448620-cd3ede92-8d3f-43d5-b771-32dd8417618f.png"></td>
+<td width="25%" style="text-align: center"><img src="https://user-images.githubusercontent.com/31246794/220448619-beed9bdb-f6bb-41c2-a7df-aa3ef1f653c5.png"></td>
+<td width="25%" style="text-align: center"><img src="https://user-images.githubusercontent.com/31246794/220448613-c99a9e04-0450-40fd-bc73-a9122cefaa2c.png"></td>
+</tr>
+</table>
+# Control Weight/Start/End
+Weight is the weight of the controlnet "influence". It's analogous to prompt attention/emphasis. E.g. (myprompt: 1.2). Technically, it's the factor by which to multiply the ControlNet outputs before merging them with original SD Unet.
+Guidance Start/End is the percentage of total steps the controlnet applies (guidance strength = guidance end). It's analogous to prompt editing/shifting. E.g. \[myprompt::0.8\] (It applies from the beginning until 80% of total steps)
+# Batch Mode
+Put any unit into batch mode to activate batch mode for all units. Specify a batch directory for each unit, or use the new textbox in the img2img batch tab as a fallback. Although the textbox is located in the img2img batch tab, you can use it to generate images in the txt2img tab as well.
+Note that this feature is only available in the gradio user interface. Call the APIs as many times as you want for custom batch scheduling.
+# API and Script Access
+This extension can accept txt2img or img2img tasks via API or external extension call. Note that you may need to enable `Allow other scripts to control this extension` in settings for external calls.
+To use the API: start WebUI with argument `--api` and go to `http://webui-address/docs` for documents or checkout [examples](https://github.com/Mikubill/sd-webui-controlnet/blob/main/example/api_txt2img.ipynb).
+To use external call: Checkout [Wiki](https://github.com/Mikubill/sd-webui-controlnet/wiki/API)
+# Command Line Arguments
+This extension adds these command line arguments to the webui:
+```
+    --controlnet-dir <path to directory with controlnet models>                                ADD a controlnet models directory
+    --controlnet-annotator-models-path <path to directory with annotator model directories>    SET the directory for annotator models
+    --no-half-controlnet                                                                       load controlnet models in full precision
+    --controlnet-preprocessor-cache-size                                                       Cache size for controlnet preprocessor results
+    --controlnet-loglevel                                                                      Log level for the controlnet extension
+```
+# MacOS Support
+Tested with pytorch nightly: https://github.com/Mikubill/sd-webui-controlnet/pull/143#issuecomment-1435058285
+To use this extension with mps and normal pytorch, currently you may need to start WebUI with `--no-half`.
+# Archive of Deprecated Versions
+The previous version (sd-webui-controlnet 1.0) is archived in
+https://github.com/lllyasviel/webui-controlnet-v1-archived
+Using this version is not a temporary stop of updates. You will stop all updates forever.
+Please consider this version if you work with professional studios that require 100% pixel-by-pixel reproduction of all previous results.
+# Thanks
+This implementation is inspired by kohya-ss/sd-webui-additional-networks

annotator/annotator_path.py ADDED Viewed

	@@ -0,0 +1,22 @@

+import os
+from modules import shared
+# Resolve the directory that holds the annotator (preprocessor) models.
+# Lookup order, first non-empty value wins:
+#   1. the 'control_net_modules_path' entry in shared.opts.data
+#   2. the 'controlnet_annotator_models_path' attribute of shared.cmd_opts
+#   3. a 'downloads' folder located next to this file
+models_path = shared.opts.data.get('control_net_modules_path', None)
+if not models_path:
+    models_path = getattr(shared.cmd_opts, 'controlnet_annotator_models_path', None)
+if not models_path:
+    models_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'downloads')
+# A relative path is interpreted relative to shared.data_path.
+if not os.path.isabs(models_path):
+    models_path = os.path.join(shared.data_path, models_path)
+# The clip_vision directory is not configurable: it is always the folder next to this file.
+clip_vision_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'clip_vision')
+# clip vision is always inside controlnet "extensions\sd-webui-controlnet"
+# and any problem can be solved by removing controlnet and reinstall
+# Canonicalise the resolved path and create the directory at import time if it is missing.
+models_path = os.path.realpath(models_path)
+os.makedirs(models_path, exist_ok=True)
+print(f'ControlNet preprocessor location: {models_path}')
+# Make sure that the default location is inside controlnet "extensions\sd-webui-controlnet"
+# so that any problem can be solved by removing controlnet and reinstall
+# if users do not change configs on their own (otherwise users will know what is wrong)

annotator/binary/__init__.py ADDED Viewed

	@@ -0,0 +1,14 @@

+import cv2
+def apply_binary(img, bin_threshold):
+    img_gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
+    if bin_threshold == 0 or bin_threshold == 255:
+        # Otsu's threshold
+        otsu_threshold, img_bin = cv2.threshold(img_gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
+        print("Otsu threshold:", otsu_threshold)
+    else:
+        _, img_bin = cv2.threshold(img_gray, bin_threshold, 255, cv2.THRESH_BINARY_INV)
+    return cv2.cvtColor(img_bin, cv2.COLOR_GRAY2RGB)

annotator/canny/__init__.py ADDED Viewed

	@@ -0,0 +1,5 @@

+import cv2
+def apply_canny(img, low_threshold, high_threshold):
+    return cv2.Canny(img, low_threshold, high_threshold)

annotator/clip/__init__.py ADDED Viewed

	@@ -0,0 +1,39 @@

+import torch
+from transformers import CLIPProcessor, CLIPVisionModel
+from modules import devices
+import os
+from annotator.annotator_path import clip_vision_path
+# URL the CLIP weights are downloaded from when they are missing locally.
+remote_model_path = "https://huggingface.co/openai/clip-vit-large-patch14/resolve/main/pytorch_model.bin"
+# Local folder the processor and model are loaded from (and the weights are downloaded into).
+clip_path = clip_vision_path
+print(f'ControlNet ClipVision location: {clip_path}')
+# Both are created lazily by apply_clip() on first use.
+clip_proc = None
+clip_vision_model = None
+def apply_clip(img):
+    global clip_proc, clip_vision_model
+    if clip_vision_model is None:
+        modelpath = os.path.join(clip_path, 'pytorch_model.bin')
+        if not os.path.exists(modelpath):
+            from basicsr.utils.download_util import load_file_from_url
+            load_file_from_url(remote_model_path, model_dir=clip_path)
+        clip_proc = CLIPProcessor.from_pretrained(clip_path)
+        clip_vision_model = CLIPVisionModel.from_pretrained(clip_path)
+    with torch.no_grad():
+        clip_vision_model = clip_vision_model.to(devices.get_device_for("controlnet"))
+        style_for_clip = clip_proc(images=img, return_tensors="pt")['pixel_values']
+        style_feat = clip_vision_model(style_for_clip.to(devices.get_device_for("controlnet")))['last_hidden_state']
+    return style_feat
+def unload_clip_model():
+    global clip_proc, clip_vision_model
+    if clip_vision_model is not None:
+        clip_vision_model.cpu()

annotator/clip_vision/config.json ADDED Viewed

	@@ -0,0 +1,171 @@

+{
+  "_name_or_path": "clip-vit-large-patch14/",
+  "architectures": [
+    "CLIPModel"
+  ],
+  "initializer_factor": 1.0,
+  "logit_scale_init_value": 2.6592,
+  "model_type": "clip",
+  "projection_dim": 768,
+  "text_config": {
+    "_name_or_path": "",
+    "add_cross_attention": false,
+    "architectures": null,
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "bos_token_id": 0,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "dropout": 0.0,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": 2,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_act": "quick_gelu",
+    "hidden_size": 768,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "initializer_factor": 1.0,
+    "initializer_range": 0.02,
+    "intermediate_size": 3072,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_eps": 1e-05,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "max_position_embeddings": 77,
+    "min_length": 0,
+    "model_type": "clip_text_model",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 12,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_hidden_layers": 12,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": 1,
+    "prefix": null,
+    "problem_type": null,
+    "projection_dim" : 768,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "transformers_version": "4.16.0.dev0",
+    "use_bfloat16": false,
+    "vocab_size": 49408
+  },
+  "text_config_dict": {
+    "hidden_size": 768,
+    "intermediate_size": 3072,
+    "num_attention_heads": 12,
+    "num_hidden_layers": 12,
+    "projection_dim": 768
+  },
+  "torch_dtype": "float32",
+  "transformers_version": null,
+  "vision_config": {
+    "_name_or_path": "",
+    "add_cross_attention": false,
+    "architectures": null,
+    "attention_dropout": 0.0,
+    "bad_words_ids": null,
+    "bos_token_id": null,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "dropout": 0.0,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_act": "quick_gelu",
+    "hidden_size": 1024,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "image_size": 224,
+    "initializer_factor": 1.0,
+    "initializer_range": 0.02,
+    "intermediate_size": 4096,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_eps": 1e-05,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "min_length": 0,
+    "model_type": "clip_vision_model",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 16,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_hidden_layers": 24,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "patch_size": 14,
+    "prefix": null,
+    "problem_type": null,
+    "projection_dim" : 768,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "transformers_version": "4.16.0.dev0",
+    "use_bfloat16": false
+  },
+  "vision_config_dict": {
+    "hidden_size": 1024,
+    "intermediate_size": 4096,
+    "num_attention_heads": 16,
+    "num_hidden_layers": 24,
+    "patch_size": 14,
+    "projection_dim": 768
+  }
+}

annotator/clip_vision/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

annotator/clip_vision/preprocessor_config.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "crop_size": 224,
+  "do_center_crop": true,
+  "do_normalize": true,
+  "do_resize": true,
+  "feature_extractor_type": "CLIPFeatureExtractor",
+  "image_mean": [
+    0.48145466,
+    0.4578275,
+    0.40821073
+  ],
+  "image_std": [
+    0.26862954,
+    0.26130258,
+    0.27577711
+  ],
+  "resample": 3,
+  "size": 224
+}

annotator/clip_vision/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

annotator/clip_vision/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,34 @@

+{
+    "unk_token": {
+        "content": "<|endoftext|>",
+        "single_word": false,
+        "lstrip": false,
+        "rstrip": false,
+        "normalized": true,
+        "__type": "AddedToken"
+    },
+    "bos_token": {
+        "content": "<|startoftext|>",
+        "single_word": false,
+        "lstrip": false,
+        "rstrip": false,
+        "normalized": true,
+        "__type": "AddedToken"
+    },
+    "eos_token": {
+        "content": "<|endoftext|>",
+        "single_word": false,
+        "lstrip": false,
+        "rstrip": false,
+        "normalized": true,
+        "__type": "AddedToken"
+    },
+    "pad_token": "<|endoftext|>",
+    "add_prefix_space": false,
+    "errors": "replace",
+    "do_lower_case": true,
+    "name_or_path": "openai/clip-vit-base-patch32",
+    "model_max_length": 77,
+    "special_tokens_map_file": "./special_tokens_map.json",
+    "tokenizer_class": "CLIPTokenizer"
+}

annotator/clip_vision/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

annotator/color/__init__.py ADDED Viewed

	@@ -0,0 +1,20 @@

+import cv2
+def cv2_resize_shortest_edge(image, size):
+    h, w = image.shape[:2]
+    if h < w:
+        new_h = size
+        new_w = int(round(w / h * size))
+    else:
+        new_w = size
+        new_h = int(round(h / w * size))
+    resized_image = cv2.resize(image, (new_w, new_h), interpolation=cv2.INTER_AREA)
+    return resized_image
+def apply_color(img, res=512):
+    img = cv2_resize_shortest_edge(img, res)
+    h, w = img.shape[:2]
+    input_img_color = cv2.resize(img, (w//64, h//64), interpolation=cv2.INTER_CUBIC)
+    input_img_color = cv2.resize(input_img_color, (w, h), interpolation=cv2.INTER_NEAREST)
+    return input_img_color

annotator/hed/__init__.py ADDED Viewed

	@@ -0,0 +1,98 @@

+# This is an improved version and model of HED edge detection with Apache License, Version 2.0.
+# Please use this implementation in your products
+# This implementation may produce slightly different results from Saining Xie's official implementations,
+# but it generates smoother edges and is more suitable for ControlNet as well as other image-to-image translations.
+# Different from official models and other implementations, this is an RGB-input model (rather than BGR)
+# and in this way it works better for gradio's RGB protocol
+import os
+import cv2
+import torch
+import numpy as np
+from einops import rearrange
+import os
+from modules import devices
+from annotator.annotator_path import models_path
+from annotator.util import safe_step, nms
+class DoubleConvBlock(torch.nn.Module):
+    def __init__(self, input_channel, output_channel, layer_number):
+        super().__init__()
+        self.convs = torch.nn.Sequential()
+        self.convs.append(torch.nn.Conv2d(in_channels=input_channel, out_channels=output_channel, kernel_size=(3, 3), stride=(1, 1), padding=1))
+        for i in range(1, layer_number):
+            self.convs.append(torch.nn.Conv2d(in_channels=output_channel, out_channels=output_channel, kernel_size=(3, 3), stride=(1, 1), padding=1))
+        self.projection = torch.nn.Conv2d(in_channels=output_channel, out_channels=1, kernel_size=(1, 1), stride=(1, 1), padding=0)
+    def __call__(self, x, down_sampling=False):
+        h = x
+        if down_sampling:
+            h = torch.nn.functional.max_pool2d(h, kernel_size=(2, 2), stride=(2, 2))
+        for conv in self.convs:
+            h = conv(h)
+            h = torch.nn.functional.relu(h)
+        return h, self.projection(h)
+class ControlNetHED_Apache2(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.norm = torch.nn.Parameter(torch.zeros(size=(1, 3, 1, 1)))
+        self.block1 = DoubleConvBlock(input_channel=3, output_channel=64, layer_number=2)
+        self.block2 = DoubleConvBlock(input_channel=64, output_channel=128, layer_number=2)
+        self.block3 = DoubleConvBlock(input_channel=128, output_channel=256, layer_number=3)
+        self.block4 = DoubleConvBlock(input_channel=256, output_channel=512, layer_number=3)
+        self.block5 = DoubleConvBlock(input_channel=512, output_channel=512, layer_number=3)
+    def __call__(self, x):
+        h = x - self.norm
+        h, projection1 = self.block1(h)
+        h, projection2 = self.block2(h, down_sampling=True)
+        h, projection3 = self.block3(h, down_sampling=True)
+        h, projection4 = self.block4(h, down_sampling=True)
+        h, projection5 = self.block5(h, down_sampling=True)
+        return projection1, projection2, projection3, projection4, projection5
+# Cached network instance; created by apply_hed() on first use.
+netNetwork = None
+# URL the checkpoint is downloaded from when no local copy is found.
+remote_model_path = "https://huggingface.co/lllyasviel/Annotators/resolve/main/ControlNetHED.pth"
+# Folder the checkpoint is downloaded into.
+modeldir = os.path.join(models_path, "hed")
+# Folder of this module; a checkpoint found here takes precedence over `modeldir`.
+old_modeldir = os.path.dirname(os.path.realpath(__file__))
+def apply_hed(input_image, is_safe=False):
+    global netNetwork
+    if netNetwork is None:
+        modelpath = os.path.join(modeldir, "ControlNetHED.pth")
+        old_modelpath = os.path.join(old_modeldir, "ControlNetHED.pth")
+        if os.path.exists(old_modelpath):
+            modelpath = old_modelpath
+        elif not os.path.exists(modelpath):
+            from basicsr.utils.download_util import load_file_from_url
+            load_file_from_url(remote_model_path, model_dir=modeldir)
+        netNetwork = ControlNetHED_Apache2().to(devices.get_device_for("controlnet"))
+        netNetwork.load_state_dict(torch.load(modelpath, map_location='cpu'))
+    netNetwork.to(devices.get_device_for("controlnet")).float().eval()
+    assert input_image.ndim == 3
+    H, W, C = input_image.shape
+    with torch.no_grad():
+        image_hed = torch.from_numpy(input_image.copy()).float().to(devices.get_device_for("controlnet"))
+        image_hed = rearrange(image_hed, 'h w c -> 1 c h w')
+        edges = netNetwork(image_hed)
+        edges = [e.detach().cpu().numpy().astype(np.float32)[0, 0] for e in edges]
+        edges = [cv2.resize(e, (W, H), interpolation=cv2.INTER_LINEAR) for e in edges]
+        edges = np.stack(edges, axis=2)
+        edge = 1 / (1 + np.exp(-np.mean(edges, axis=2).astype(np.float64)))
+        if is_safe:
+            edge = safe_step(edge)
+        edge = (edge * 255.0).clip(0, 255).astype(np.uint8)
+        return edge
+def unload_hed_model():
+    global netNetwork
+    if netNetwork is not None:
+        netNetwork.cpu()

annotator/keypose/__init__.py ADDED Viewed

	@@ -0,0 +1,212 @@

+import numpy as np
+import cv2
+import torch
+import os
+from modules import devices
+from annotator.annotator_path import models_path
+import mmcv
+from mmdet.apis import inference_detector, init_detector
+from mmpose.apis import inference_top_down_pose_model
+from mmpose.apis import init_pose_model, process_mmdet_results, vis_pose_result
+def preprocessing(image, device):
+    # Resize
+    scale = 640 / max(image.shape[:2])
+    image = cv2.resize(image, dsize=None, fx=scale, fy=scale)
+    raw_image = image.astype(np.uint8)
+    # Subtract mean values
+    image = image.astype(np.float32)
+    image -= np.array(
+        [
+            float(104.008),
+            float(116.669),
+            float(122.675),
+        ]
+    )
+    # Convert to torch.Tensor and add "batch" axis
+    image = torch.from_numpy(image.transpose(2, 0, 1)).float().unsqueeze(0)
+    image = image.to(device)
+    return image, raw_image
+def imshow_keypoints(img,
+                     pose_result,
+                     skeleton=None,
+                     kpt_score_thr=0.1,
+                     pose_kpt_color=None,
+                     pose_link_color=None,
+                     radius=4,
+                     thickness=1):
+    """Draw keypoints and links on an image.
+    Args:
+            img (ndarry): The image to draw poses on.
+            pose_result (list[kpts]): The poses to draw. Each element kpts is
+                a set of K keypoints as an Kx3 numpy.ndarray, where each
+                keypoint is represented as x, y, score.
+            kpt_score_thr (float, optional): Minimum score of keypoints
+                to be shown. Default: 0.3.
+            pose_kpt_color (np.array[Nx3]`): Color of N keypoints. If None,
+                the keypoint will not be drawn.
+            pose_link_color (np.array[Mx3]): Color of M links. If None, the
+                links will not be drawn.
+            thickness (int): Thickness of lines.
+    """
+    img_h, img_w, _ = img.shape
+    img = np.zeros(img.shape)
+    for idx, kpts in enumerate(pose_result):
+        if idx > 1:
+            continue
+        kpts = kpts['keypoints']
+        # print(kpts)
+        kpts = np.array(kpts, copy=False)
+        # draw each point on image
+        if pose_kpt_color is not None:
+            assert len(pose_kpt_color) == len(kpts)
+            for kid, kpt in enumerate(kpts):
+                x_coord, y_coord, kpt_score = int(kpt[0]), int(kpt[1]), kpt[2]
+                if kpt_score < kpt_score_thr or pose_kpt_color[kid] is None:
+                    # skip the point that should not be drawn
+                    continue
+                color = tuple(int(c) for c in pose_kpt_color[kid])
+                cv2.circle(img, (int(x_coord), int(y_coord)),
+                           radius, color, -1)
+        # draw links
+        if skeleton is not None and pose_link_color is not None:
+            assert len(pose_link_color) == len(skeleton)
+            for sk_id, sk in enumerate(skeleton):
+                pos1 = (int(kpts[sk[0], 0]), int(kpts[sk[0], 1]))
+                pos2 = (int(kpts[sk[1], 0]), int(kpts[sk[1], 1]))
+                if (pos1[0] <= 0 or pos1[0] >= img_w or pos1[1] <= 0 or pos1[1] >= img_h or pos2[0] <= 0
+                        or pos2[0] >= img_w or pos2[1] <= 0 or pos2[1] >= img_h or kpts[sk[0], 2] < kpt_score_thr
+                        or kpts[sk[1], 2] < kpt_score_thr or pose_link_color[sk_id] is None):
+                    # skip the link that should not be drawn
+                    continue
+                color = tuple(int(c) for c in pose_link_color[sk_id])
+                cv2.line(img, pos1, pos2, color, thickness=thickness)
+    return img
+# Cached detector and pose models; assigned by apply_keypose().
+human_det, pose_model = None, None
+# URLs the checkpoints are downloaded from when no local copy is found.
+det_model_path = "https://download.openmmlab.com/mmdetection/v2.0/faster_rcnn/faster_rcnn_r50_fpn_1x_coco/faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth"
+pose_model_path = "https://download.openmmlab.com/mmpose/top_down/hrnet/hrnet_w48_coco_256x192-b9e0b3ab_20200708.pth"
+# Folder checkpoints are downloaded into.
+modeldir = os.path.join(models_path, "keypose")
+# Folder of this module; a checkpoint found here takes precedence over `modeldir`.
+old_modeldir = os.path.dirname(os.path.realpath(__file__))
+# Config file names and checkpoint file names for the detector and the pose model.
+det_config = 'faster_rcnn_r50_fpn_coco.py'
+pose_config = 'hrnet_w48_coco_256x192.py'
+det_checkpoint = 'faster_rcnn_r50_fpn_1x_coco_20200130-047c8118.pth'
+pose_checkpoint = 'hrnet_w48_coco_256x192-b9e0b3ab_20200708.pth'
+# Category id handed to process_mmdet_results() to keep the person boxes.
+det_cat_id = 1
+# Bounding-box threshold handed to inference_top_down_pose_model().
+bbox_thr = 0.2
+# Pairs of keypoint indices that imshow_keypoints() connects with a line.
+skeleton = [
+    [15, 13], [13, 11], [16, 14], [14, 12], [11, 12], [5, 11], [6, 12], [5, 6], [5, 7], [6, 8],
+    [7, 9], [8, 10],
+    [1, 2], [0, 1], [0, 2], [1, 3], [2, 4], [3, 5], [4, 6]
+]
+# One colour triple per keypoint (17 entries).
+pose_kpt_color = [
+    [51, 153, 255], [51, 153, 255], [51, 153, 255], [51, 153, 255], [51, 153, 255],
+    [0, 255, 0],
+    [255, 128, 0], [0, 255, 0], [255, 128, 0], [0, 255, 0], [255, 128, 0], [0, 255, 0],
+    [255, 128, 0],
+    [0, 255, 0], [255, 128, 0], [0, 255, 0], [255, 128, 0]
+]
+# One colour triple per entry of `skeleton` (19 entries).
+pose_link_color = [
+    [0, 255, 0], [0, 255, 0], [255, 128, 0], [255, 128, 0],
+    [51, 153, 255], [51, 153, 255], [51, 153, 255], [51, 153, 255], [0, 255, 0],
+    [255, 128, 0],
+    [0, 255, 0], [255, 128, 0], [51, 153, 255], [51, 153, 255], [51, 153, 255],
+    [51, 153, 255],
+    [51, 153, 255], [51, 153, 255], [51, 153, 255]
+]
+def find_download_model(checkpoint, remote_path):
+    modelpath = os.path.join(modeldir, checkpoint)
+    old_modelpath = os.path.join(old_modeldir, checkpoint)
+    if os.path.exists(old_modelpath):
+        modelpath = old_modelpath
+    elif not os.path.exists(modelpath):
+        from basicsr.utils.download_util import load_file_from_url
+        load_file_from_url(remote_path, model_dir=modeldir)
+    return modelpath
+def apply_keypose(input_image):
+    global human_det, pose_model
+    if netNetwork is None:
+        det_model_local = find_download_model(det_checkpoint, det_model_path)
+        hrnet_model_local = find_download_model(pose_checkpoint, pose_model_path)
+        det_config_mmcv = mmcv.Config.fromfile(det_config)
+        pose_config_mmcv = mmcv.Config.fromfile(pose_config)
+        human_det = init_detector(det_config_mmcv, det_model_local, device=devices.get_device_for("controlnet"))
+        pose_model = init_pose_model(pose_config_mmcv, hrnet_model_local, device=devices.get_device_for("controlnet"))
+    assert input_image.ndim == 3
+    input_image = input_image.copy()
+    with torch.no_grad():
+        image = torch.from_numpy(input_image).float().to(devices.get_device_for("controlnet"))
+        image = image / 255.0
+        mmdet_results = inference_detector(human_det, image)
+        # keep the person class bounding boxes.
+        person_results = process_mmdet_results(mmdet_results, det_cat_id)
+        return_heatmap = False
+        dataset = pose_model.cfg.data['test']['type']
+        # e.g. use ('backbone', ) to return backbone feature
+        output_layer_names = None
+        pose_results, _ = inference_top_down_pose_model(
+            pose_model,
+            image,
+            person_results,
+            bbox_thr=bbox_thr,
+            format='xyxy',
+            dataset=dataset,
+            dataset_info=None,
+            return_heatmap=return_heatmap,
+            outputs=output_layer_names
+        )
+        im_keypose_out = imshow_keypoints(
+            image,
+            pose_results,
+            skeleton=skeleton,
+            pose_kpt_color=pose_kpt_color,
+            pose_link_color=pose_link_color,
+            radius=2,
+            thickness=2
+        )
+        im_keypose_out = im_keypose_out.astype(np.uint8)
+        # image_hed = rearrange(image_hed, 'h w c -> 1 c h w')
+        # edge = netNetwork(image_hed)[0]
+        # edge = (edge.cpu().numpy() * 255.0).clip(0, 255).astype(np.uint8)
+        return im_keypose_out
+def unload_hed_model():
+    global netNetwork
+    if netNetwork is not None:
+        netNetwork.cpu()

annotator/keypose/faster_rcnn_r50_fpn_coco.py ADDED Viewed

	@@ -0,0 +1,182 @@

+# Runtime, optimizer and schedule settings of the detector config.
+# NOTE(review): these look like training-time settings; presumably they are not
+# used when the config is loaded for inference only — confirm.
+checkpoint_config = dict(interval=1)
+# yapf:disable
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook')
+    ])
+# yapf:enable
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
+# optimizer
+optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.001,
+    step=[8, 11])
+total_epochs = 12
+# Detector definition: a FasterRCNN with a ResNet-50 backbone, an FPN neck,
+# an RPN head and a standard RoI head with 80 classes.
+model = dict(
+    type='FasterRCNN',
+    pretrained='torchvision://resnet50',
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        frozen_stages=1,
+        norm_cfg=dict(type='BN', requires_grad=True),
+        norm_eval=True,
+        style='pytorch'),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=5),
+    rpn_head=dict(
+        type='RPNHead',
+        in_channels=256,
+        feat_channels=256,
+        anchor_generator=dict(
+            type='AnchorGenerator',
+            scales=[8],
+            ratios=[0.5, 1.0, 2.0],
+            strides=[4, 8, 16, 32, 64]),
+        bbox_coder=dict(
+            type='DeltaXYWHBBoxCoder',
+            target_means=[.0, .0, .0, .0],
+            target_stds=[1.0, 1.0, 1.0, 1.0]),
+        loss_cls=dict(
+            type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
+        loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
+    roi_head=dict(
+        type='StandardRoIHead',
+        bbox_roi_extractor=dict(
+            type='SingleRoIExtractor',
+            roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
+            out_channels=256,
+            featmap_strides=[4, 8, 16, 32]),
+        bbox_head=dict(
+            type='Shared2FCBBoxHead',
+            in_channels=256,
+            fc_out_channels=1024,
+            roi_feat_size=7,
+            num_classes=80,
+            bbox_coder=dict(
+                type='DeltaXYWHBBoxCoder',
+                target_means=[0., 0., 0., 0.],
+                target_stds=[0.1, 0.1, 0.2, 0.2]),
+            reg_class_agnostic=False,
+            loss_cls=dict(
+                type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
+            loss_bbox=dict(type='L1Loss', loss_weight=1.0))),
+    # model training and testing settings
+    train_cfg=dict(
+        rpn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.7,
+                neg_iou_thr=0.3,
+                min_pos_iou=0.3,
+                match_low_quality=True,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=256,
+                pos_fraction=0.5,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=False),
+            allowed_border=-1,
+            pos_weight=-1,
+            debug=False),
+        rpn_proposal=dict(
+            nms_pre=2000,
+            max_per_img=1000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            assigner=dict(
+                type='MaxIoUAssigner',
+                pos_iou_thr=0.5,
+                neg_iou_thr=0.5,
+                min_pos_iou=0.5,
+                match_low_quality=False,
+                ignore_iof_thr=-1),
+            sampler=dict(
+                type='RandomSampler',
+                num=512,
+                pos_fraction=0.25,
+                neg_pos_ub=-1,
+                add_gt_as_proposals=True),
+            pos_weight=-1,
+            debug=False)),
+    # Settings applied at test time: proposal filtering for the RPN and
+    # score threshold / NMS for the final detections.
+    test_cfg=dict(
+        rpn=dict(
+            nms_pre=1000,
+            max_per_img=1000,
+            nms=dict(type='nms', iou_threshold=0.7),
+            min_bbox_size=0),
+        rcnn=dict(
+            score_thr=0.05,
+            nms=dict(type='nms', iou_threshold=0.5),
+            max_per_img=100)
+        # soft-nms is also supported for rcnn testing
+        # e.g., nms=dict(type='soft_nms', iou_threshold=0.5, min_score=0.05)
+    ))
+# Dataset and pipeline settings. The annotation and image paths below are
+# relative ('data/coco'); NOTE(review): presumably they are only needed for
+# training/evaluation and need not exist for inference — confirm.
+dataset_type = 'CocoDataset'
+data_root = 'data/coco'
+# Per-channel mean/std used by the 'Normalize' steps of both pipelines.
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', with_bbox=True),
+    dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
+    dict(type='RandomFlip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size_divisor=32),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 800),
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='DefaultFormatBundle'),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+# The val and test splits share the same annotation file and pipeline.
+data = dict(
+    samples_per_gpu=2,
+    workers_per_gpu=2,
+    train=dict(
+        type=dataset_type,
+        ann_file=f'{data_root}/annotations/instances_train2017.json',
+        img_prefix=f'{data_root}/train2017/',
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=f'{data_root}/annotations/instances_val2017.json',
+        img_prefix=f'{data_root}/val2017/',
+        pipeline=test_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=f'{data_root}/annotations/instances_val2017.json',
+        img_prefix=f'{data_root}/val2017/',
+        pipeline=test_pipeline))
+evaluation = dict(interval=1, metric='bbox')

annotator/keypose/hrnet_w48_coco_256x192.py ADDED Viewed

	@@ -0,0 +1,169 @@

+# The base-config list is commented out, so this file stands alone.
+# NOTE(review): the `{{_base_.dataset_info}}` placeholders in the `data`
+# section of this file refer to one of these base files — confirm that they
+# still resolve as intended with the list disabled.
+# _base_ = [
+#     '../../../../_base_/default_runtime.py',
+#     '../../../../_base_/datasets/coco.py'
+# ]
+evaluation = dict(interval=10, metric='mAP', save_best='AP')
+optimizer = dict(
+    type='Adam',
+    lr=5e-4,
+)
+optimizer_config = dict(grad_clip=None)
+# learning policy
+lr_config = dict(
+    policy='step',
+    warmup='linear',
+    warmup_iters=500,
+    warmup_ratio=0.001,
+    step=[170, 200])
+total_epochs = 210
+# Keypoint channel layout (17 joints); reused by `model` and `data_cfg` in this file.
+channel_cfg = dict(
+    num_output_channels=17,
+    dataset_joints=17,
+    dataset_channel=[
+        [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
+    ],
+    inference_channel=[
+        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
+    ])
+# model settings
+# A 'TopDown' pose estimator: HRNet backbone (four stages with 1, 2, 3 and 4
+# branches) followed by a heatmap head with one output channel per keypoint.
+model = dict(
+    type='TopDown',
+    pretrained='https://download.openmmlab.com/mmpose/'
+    'pretrain_models/hrnet_w48-8ef0771d.pth',
+    backbone=dict(
+        type='HRNet',
+        in_channels=3,
+        extra=dict(
+            stage1=dict(
+                num_modules=1,
+                num_branches=1,
+                block='BOTTLENECK',
+                num_blocks=(4, ),
+                num_channels=(64, )),
+            stage2=dict(
+                num_modules=1,
+                num_branches=2,
+                block='BASIC',
+                num_blocks=(4, 4),
+                num_channels=(48, 96)),
+            stage3=dict(
+                num_modules=4,
+                num_branches=3,
+                block='BASIC',
+                num_blocks=(4, 4, 4),
+                num_channels=(48, 96, 192)),
+            stage4=dict(
+                num_modules=3,
+                num_branches=4,
+                block='BASIC',
+                num_blocks=(4, 4, 4, 4),
+                num_channels=(48, 96, 192, 384))),
+    ),
+    keypoint_head=dict(
+        type='TopdownHeatmapSimpleHead',
+        in_channels=48,
+        out_channels=channel_cfg['num_output_channels'],
+        num_deconv_layers=0,
+        extra=dict(final_conv_kernel=1, ),
+        loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
+    train_cfg=dict(),
+    # Test-time options of the pose model.
+    test_cfg=dict(
+        flip_test=True,
+        post_process='default',
+        shift_heatmap=True,
+        modulate_kernel=11))
+# Data settings shared by the train/val/test dataset entries of `data`.
+# image_size and heatmap_size are given as [192, 256] and [48, 64].
+data_cfg = dict(
+    image_size=[192, 256],
+    heatmap_size=[48, 64],
+    num_output_channels=channel_cfg['num_output_channels'],
+    num_joints=channel_cfg['dataset_joints'],
+    dataset_channel=channel_cfg['dataset_channel'],
+    inference_channel=channel_cfg['inference_channel'],
+    soft_nms=False,
+    nms_thr=1.0,
+    oks_thr=0.9,
+    vis_thr=0.2,
+    use_gt_bbox=False,
+    det_bbox_thr=0.0,
+    bbox_file='data/coco/person_detection_results/'
+    'COCO_val2017_detections_AP_H_56_person.json',
+)
+# Training pipeline: bounding-box based cropping with random augmentation,
+# tensor conversion/normalisation and heatmap target generation.
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownGetBboxCenterScale', padding=1.25),
+    dict(type='TopDownRandomShiftBboxCenter', shift_factor=0.16, prob=0.3),
+    dict(type='TopDownRandomFlip', flip_prob=0.5),
+    dict(
+        type='TopDownHalfBodyTransform',
+        num_joints_half_body=8,
+        prob_half_body=0.3),
+    dict(
+        type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
+    dict(type='TopDownAffine'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),
+    dict(type='TopDownGenerateTarget', sigma=2),
+    dict(
+        type='Collect',
+        keys=['img', 'target', 'target_weight'],
+        meta_keys=[
+            'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
+            'rotation', 'bbox_score', 'flip_pairs'
+        ]),
+]
+# Validation pipeline: the same steps without the random augmentation and
+# target generation.
+val_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='TopDownGetBboxCenterScale', padding=1.25),
+    dict(type='TopDownAffine'),
+    dict(type='ToTensor'),
+    dict(
+        type='NormalizeTensor',
+        mean=[0.485, 0.456, 0.406],
+        std=[0.229, 0.224, 0.225]),
+    dict(
+        type='Collect',
+        keys=['img'],
+        meta_keys=[
+            'image_file', 'center', 'scale', 'rotation', 'bbox_score',
+            'flip_pairs'
+        ]),
+]
+# The test pipeline is the validation pipeline.
+test_pipeline = val_pipeline
+data_root = 'data/coco'
+# NOTE(review): `{{_base_.dataset_info}}` is a config-loader placeholder, not
+# Python syntax, and the `_base_` list at the top of this file is commented
+# out — confirm how the loader treats it.
+data = dict(
+    samples_per_gpu=32,
+    workers_per_gpu=2,
+    val_dataloader=dict(samples_per_gpu=32),
+    test_dataloader=dict(samples_per_gpu=32),
+    train=dict(
+        type='TopDownCocoDataset',
+        ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
+        img_prefix=f'{data_root}/train2017/',
+        data_cfg=data_cfg,
+        pipeline=train_pipeline,
+        dataset_info={{_base_.dataset_info}}),
+    val=dict(
+        type='TopDownCocoDataset',
+        ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
+        img_prefix=f'{data_root}/val2017/',
+        data_cfg=data_cfg,
+        pipeline=val_pipeline,
+        dataset_info={{_base_.dataset_info}}),
+    test=dict(
+        type='TopDownCocoDataset',
+        ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
+        img_prefix=f'{data_root}/val2017/',
+        data_cfg=data_cfg,
+        pipeline=test_pipeline,
+        dataset_info={{_base_.dataset_info}}),
+)

annotator/lama/__init__.py ADDED Viewed

	@@ -0,0 +1,58 @@

+# https://github.com/advimman/lama
+import yaml
+import torch
+from omegaconf import OmegaConf
+import numpy as np
+from einops import rearrange
+import os
+from modules import devices
+from annotator.annotator_path import models_path
+from annotator.lama.saicinpainting.training.trainers import load_checkpoint
+class LamaInpainting:
+    model_dir = os.path.join(models_path, "lama")
+    def __init__(self):
+        self.model = None
+        self.device = devices.get_device_for("controlnet")
+    def load_model(self):
+        remote_model_path = "https://huggingface.co/lllyasviel/Annotators/resolve/main/ControlNetLama.pth"
+        modelpath = os.path.join(self.model_dir, "ControlNetLama.pth")
+        if not os.path.exists(modelpath):
+            from basicsr.utils.download_util import load_file_from_url
+            load_file_from_url(remote_model_path, model_dir=self.model_dir)
+        config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.yaml')
+        cfg = yaml.safe_load(open(config_path, 'rt'))
+        cfg = OmegaConf.create(cfg)
+        cfg.training_model.predict_only = True
+        cfg.visualizer.kind = 'noop'
+        self.model = load_checkpoint(cfg, os.path.abspath(modelpath), strict=False, map_location='cpu')
+        self.model = self.model.to(self.device)
+        self.model.eval()
+    def unload_model(self):
+        if self.model is not None:
+            self.model.cpu()
+    def __call__(self, input_image):
+        if self.model is None:
+            self.load_model()
+        self.model.to(self.device)
+        color = np.ascontiguousarray(input_image[:, :, 0:3]).astype(np.float32) / 255.0
+        mask = np.ascontiguousarray(input_image[:, :, 3:4]).astype(np.float32) / 255.0
+        with torch.no_grad():
+            color = torch.from_numpy(color).float().to(self.device)
+            mask = torch.from_numpy(mask).float().to(self.device)
+            mask = (mask > 0.5).float()
+            color = color * (1 - mask)
+            image_feed = torch.cat([color, mask], dim=2)
+            image_feed = rearrange(image_feed, 'h w c -> 1 c h w')
+            result = self.model(image_feed)[0]
+            result = rearrange(result, 'c h w -> h w c')
+            result = result * mask + color * (1 - mask)
+            result *= 255.0
+            return result.detach().cpu().numpy().clip(0, 255).astype(np.uint8)

annotator/lama/config.yaml ADDED Viewed

	@@ -0,0 +1,157 @@

+run_title: b18_ffc075_batch8x15
+# Options of the training-model wrapper. At inference time
+# annotator/lama/__init__.py additionally sets training_model.predict_only = True
+# on the loaded config before building the model.
+training_model:
+  kind: default
+  visualize_each_iters: 1000
+  concat_mask: true
+  store_discr_outputs_for_vis: true
+losses:
+  l1:
+    weight_missing: 0
+    weight_known: 10
+  perceptual:
+    weight: 0
+  adversarial:
+    kind: r1
+    weight: 10
+    gp_coef: 0.001
+    mask_as_fake_target: true
+    allow_scale_mask: true
+  feature_matching:
+    weight: 100
+  resnet_pl:
+    weight: 30
+    weights_path: ${env:TORCH_HOME}
+optimizers:
+  generator:
+    kind: adam
+    lr: 0.001
+  discriminator:
+    kind: adam
+    lr: 0.0001
+# Visualizer settings from the original training run.
+# annotator/lama/__init__.py overrides `kind` with 'noop' after loading this file.
+# NOTE(review): `outdir` is an absolute path from the training environment; it is
+# presumably unused once `kind` is 'noop' — confirm.
+visualizer:
+  key_order:
+  - image
+  - predicted_image
+  - discr_output_fake
+  - discr_output_real
+  - inpainted
+  rescale_keys:
+  - discr_output_fake
+  - discr_output_real
+  kind: directory
+  outdir: /group-volume/User-Driven-Content-Generation/r.suvorov/inpainting/experiments/r.suvorov_2021-04-30_14-41-12_train_simple_pix2pix2_gap_sdpl_novgg_large_b18_ffc075_batch8x15/samples
+location:
+  data_root_dir: /group-volume/User-Driven-Content-Generation/datasets/inpainting_data_root_large
+  out_root_dir: /group-volume/User-Driven-Content-Generation/${env:USER}/inpainting/experiments
+  tb_dir: /group-volume/User-Driven-Content-Generation/${env:USER}/inpainting/tb_logs
+data:
+  batch_size: 15
+  val_batch_size: 2
+  num_workers: 3
+  train:
+    indir: ${location.data_root_dir}/train
+    out_size: 256
+    mask_gen_kwargs:
+      irregular_proba: 1
+      irregular_kwargs:
+        max_angle: 4
+        max_len: 200
+        max_width: 100
+        max_times: 5
+        min_times: 1
+      box_proba: 1
+      box_kwargs:
+        margin: 10
+        bbox_min_size: 30
+        bbox_max_size: 150
+        max_times: 3
+        min_times: 1
+      segm_proba: 0
+      segm_kwargs:
+        confidence_threshold: 0.5
+        max_object_area: 0.5
+        min_mask_area: 0.07
+        downsample_levels: 6
+        num_variants_per_mask: 1
+        rigidness_mode: 1
+        max_foreground_coverage: 0.3
+        max_foreground_intersection: 0.7
+        max_mask_intersection: 0.1
+        max_hidden_area: 0.1
+        max_scale_change: 0.25
+        horizontal_flip: true
+        max_vertical_shift: 0.2
+        position_shuffle: true
+    transform_variant: distortions
+    dataloader_kwargs:
+      batch_size: ${data.batch_size}
+      shuffle: true
+      num_workers: ${data.num_workers}
+  val:
+    indir: ${location.data_root_dir}/val
+    img_suffix: .png
+    dataloader_kwargs:
+      batch_size: ${data.val_batch_size}
+      shuffle: false
+      num_workers: ${data.num_workers}
+  visual_test:
+    indir: ${location.data_root_dir}/korean_test
+    img_suffix: _input.png
+    pad_out_to_modulo: 32
+    dataloader_kwargs:
+      batch_size: 1
+      shuffle: false
+      num_workers: ${data.num_workers}
+generator:
+  kind: ffc_resnet
+  input_nc: 4
+  output_nc: 3
+  ngf: 64
+  n_downsampling: 3
+  n_blocks: 18
+  add_out_act: sigmoid
+  init_conv_kwargs:
+    ratio_gin: 0
+    ratio_gout: 0
+    enable_lfu: false
+  downsample_conv_kwargs:
+    ratio_gin: ${generator.init_conv_kwargs.ratio_gout}
+    ratio_gout: ${generator.downsample_conv_kwargs.ratio_gin}
+    enable_lfu: false
+  resnet_conv_kwargs:
+    ratio_gin: 0.75
+    ratio_gout: ${generator.resnet_conv_kwargs.ratio_gin}
+    enable_lfu: false
+discriminator:
+  kind: pix2pixhd_nlayer
+  input_nc: 3
+  ndf: 64
+  n_layers: 4
+evaluator:
+  kind: default
+  inpainted_key: inpainted
+  integral_kind: ssim_fid100_f1
+# Keyword-argument groups for the trainer (`kwargs`) and for checkpointing
+# (`checkpoint_kwargs`).
+trainer:
+  kwargs:
+    gpus: -1
+    accelerator: ddp
+    max_epochs: 200
+    gradient_clip_val: 1
+    # NOTE(review): YAML reads a bare `None` as the string "None", not as null
+    # (null is spelled `null` or `~`) — confirm this is intended.
+    log_gpu_memory: None
+    limit_train_batches: 25000
+    # Interpolated: validation runs every `limit_train_batches` training batches.
+    val_check_interval: ${trainer.kwargs.limit_train_batches}
+    log_every_n_steps: 1000
+    precision: 32
+    terminate_on_nan: false
+    check_val_every_n_epoch: 1
+    num_sanity_val_steps: 8
+    limit_val_batches: 1000
+    replace_sampler_ddp: false
+  checkpoint_kwargs:
+    verbose: true
+    save_top_k: 5
+    save_last: true
+    period: 1
+    # Checkpoints are ranked by this metric; `mode: max` means higher is better.
+    monitor: val_ssim_fid100_f1_total_mean
+    mode: max

annotator/lama/saicinpainting/__init__.py ADDED Viewed

File without changes

annotator/lama/saicinpainting/training/__init__.py ADDED Viewed

File without changes

annotator/lama/saicinpainting/training/data/__init__.py ADDED Viewed

File without changes

annotator/lama/saicinpainting/training/data/masks.py ADDED Viewed

	@@ -0,0 +1,332 @@

+import math
+import random
+import hashlib
+import logging
+from enum import Enum
+import cv2
+import numpy as np
+# from annotator.lama.saicinpainting.evaluation.masks.mask import SegmentationMask
+from annotator.lama.saicinpainting.utils import LinearRamp
+LOGGER = logging.getLogger(__name__)
+class DrawMethod(Enum):
+    """Stroke primitives that make_random_irregular_mask can draw."""
+    LINE = 'line'
+    CIRCLE = 'circle'
+    SQUARE = 'square'
+def make_random_irregular_mask(shape, max_angle=4, max_len=60, max_width=20, min_times=0, max_times=10,
+                               draw_method=DrawMethod.LINE):
+    draw_method = DrawMethod(draw_method)
+    height, width = shape
+    mask = np.zeros((height, width), np.float32)
+    times = np.random.randint(min_times, max_times + 1)
+    for i in range(times):
+        start_x = np.random.randint(width)
+        start_y = np.random.randint(height)
+        for j in range(1 + np.random.randint(5)):
+            angle = 0.01 + np.random.randint(max_angle)
+            if i % 2 == 0:
+                angle = 2 * 3.1415926 - angle
+            length = 10 + np.random.randint(max_len)
+            brush_w = 5 + np.random.randint(max_width)
+            end_x = np.clip((start_x + length * np.sin(angle)).astype(np.int32), 0, width)
+            end_y = np.clip((start_y + length * np.cos(angle)).astype(np.int32), 0, height)
+            if draw_method == DrawMethod.LINE:
+                cv2.line(mask, (start_x, start_y), (end_x, end_y), 1.0, brush_w)
+            elif draw_method == DrawMethod.CIRCLE:
+                cv2.circle(mask, (start_x, start_y), radius=brush_w, color=1., thickness=-1)
+            elif draw_method == DrawMethod.SQUARE:
+                radius = brush_w // 2
+                mask[start_y - radius:start_y + radius, start_x - radius:start_x + radius] = 1
+            start_x, start_y = end_x, end_y
+    return mask[None, ...]
+class RandomIrregularMaskGenerator:
+    def __init__(self, max_angle=4, max_len=60, max_width=20, min_times=0, max_times=10, ramp_kwargs=None,
+                 draw_method=DrawMethod.LINE):
+        self.max_angle = max_angle
+        self.max_len = max_len
+        self.max_width = max_width
+        self.min_times = min_times
+        self.max_times = max_times
+        self.draw_method = draw_method
+        self.ramp = LinearRamp(**ramp_kwargs) if ramp_kwargs is not None else None
+    def __call__(self, img, iter_i=None, raw_image=None):
+        coef = self.ramp(iter_i) if (self.ramp is not None) and (iter_i is not None) else 1
+        cur_max_len = int(max(1, self.max_len * coef))
+        cur_max_width = int(max(1, self.max_width * coef))
+        cur_max_times = int(self.min_times + 1 + (self.max_times - self.min_times) * coef)
+        return make_random_irregular_mask(img.shape[1:], max_angle=self.max_angle, max_len=cur_max_len,
+                                          max_width=cur_max_width, min_times=self.min_times, max_times=cur_max_times,
+                                          draw_method=self.draw_method)
+def make_random_rectangle_mask(shape, margin=10, bbox_min_size=30, bbox_max_size=100, min_times=0, max_times=3):
+    height, width = shape
+    mask = np.zeros((height, width), np.float32)
+    bbox_max_size = min(bbox_max_size, height - margin * 2, width - margin * 2)
+    times = np.random.randint(min_times, max_times + 1)
+    for i in range(times):
+        box_width = np.random.randint(bbox_min_size, bbox_max_size)
+        box_height = np.random.randint(bbox_min_size, bbox_max_size)
+        start_x = np.random.randint(margin, width - margin - box_width + 1)
+        start_y = np.random.randint(margin, height - margin - box_height + 1)
+        mask[start_y:start_y + box_height, start_x:start_x + box_width] = 1
+    return mask[None, ...]
+class RandomRectangleMaskGenerator:
+    def __init__(self, margin=10, bbox_min_size=30, bbox_max_size=100, min_times=0, max_times=3, ramp_kwargs=None):
+        self.margin = margin
+        self.bbox_min_size = bbox_min_size
+        self.bbox_max_size = bbox_max_size
+        self.min_times = min_times
+        self.max_times = max_times
+        self.ramp = LinearRamp(**ramp_kwargs) if ramp_kwargs is not None else None
+    def __call__(self, img, iter_i=None, raw_image=None):
+        coef = self.ramp(iter_i) if (self.ramp is not None) and (iter_i is not None) else 1
+        cur_bbox_max_size = int(self.bbox_min_size + 1 + (self.bbox_max_size - self.bbox_min_size) * coef)
+        cur_max_times = int(self.min_times + (self.max_times - self.min_times) * coef)
+        return make_random_rectangle_mask(img.shape[1:], margin=self.margin, bbox_min_size=self.bbox_min_size,
+                                          bbox_max_size=cur_bbox_max_size, min_times=self.min_times,
+                                          max_times=cur_max_times)
+class RandomSegmentationMaskGenerator:
+    def __init__(self, **kwargs):
+        self.impl = None  # will be instantiated in first call (effectively in subprocess)
+        self.kwargs = kwargs
+    def __call__(self, img, iter_i=None, raw_image=None):
+        if self.impl is None:
+            self.impl = SegmentationMask(**self.kwargs)
+        masks = self.impl.get_masks(np.transpose(img, (1, 2, 0)))
+        masks = [m for m in masks if len(np.unique(m)) > 1]
+        return np.random.choice(masks)
+def make_random_superres_mask(shape, min_step=2, max_step=4, min_width=1, max_width=3):
+    height, width = shape
+    mask = np.zeros((height, width), np.float32)
+    step_x = np.random.randint(min_step, max_step + 1)
+    width_x = np.random.randint(min_width, min(step_x, max_width + 1))
+    offset_x = np.random.randint(0, step_x)
+    step_y = np.random.randint(min_step, max_step + 1)
+    width_y = np.random.randint(min_width, min(step_y, max_width + 1))
+    offset_y = np.random.randint(0, step_y)
+    for dy in range(width_y):
+        mask[offset_y + dy::step_y] = 1
+    for dx in range(width_x):
+        mask[:, offset_x + dx::step_x] = 1
+    return mask[None, ...]
+class RandomSuperresMaskGenerator:
+    def __init__(self, **kwargs):
+        self.kwargs = kwargs
+    def __call__(self, img, iter_i=None):
+        return make_random_superres_mask(img.shape[1:], **self.kwargs)
+class DumbAreaMaskGenerator:
+    min_ratio = 0.1
+    max_ratio = 0.35
+    default_ratio = 0.225
+    def __init__(self, is_training):
+        #Parameters:
+        #    is_training(bool): If true - random rectangular mask, if false - central square mask
+        self.is_training = is_training
+    def _random_vector(self, dimension):
+        if self.is_training:
+            lower_limit = math.sqrt(self.min_ratio)
+            upper_limit = math.sqrt(self.max_ratio)
+            mask_side = round((random.random() * (upper_limit - lower_limit) + lower_limit) * dimension)
+            u = random.randint(0, dimension-mask_side-1)
+            v = u+mask_side
+        else:
+            margin = (math.sqrt(self.default_ratio) / 2) * dimension
+            u = round(dimension/2 - margin)
+            v = round(dimension/2 + margin)
+        return u, v
+    def __call__(self, img, iter_i=None, raw_image=None):
+        c, height, width = img.shape
+        mask = np.zeros((height, width), np.float32)
+        x1, x2 = self._random_vector(width)
+        y1, y2 = self._random_vector(height)
+        mask[x1:x2, y1:y2] = 1
+        return mask[None, ...]
+class OutpaintingMaskGenerator:
+    def __init__(self, min_padding_percent:float=0.04, max_padding_percent:int=0.25, left_padding_prob:float=0.5, top_padding_prob:float=0.5,
+                 right_padding_prob:float=0.5, bottom_padding_prob:float=0.5, is_fixed_randomness:bool=False):
+        """
+        is_fixed_randomness - get identical paddings for the same image if args are the same
+        """
+        self.min_padding_percent = min_padding_percent
+        self.max_padding_percent = max_padding_percent
+        self.probs = [left_padding_prob, top_padding_prob, right_padding_prob, bottom_padding_prob]
+        self.is_fixed_randomness = is_fixed_randomness
+        assert self.min_padding_percent <= self.max_padding_percent
+        assert self.max_padding_percent > 0
+        assert len([x for x in [self.min_padding_percent, self.max_padding_percent] if (x>=0 and x<=1)]) == 2, f"Padding percentage should be in [0,1]"
+        assert sum(self.probs) > 0, f"At least one of the padding probs should be greater than 0 - {self.probs}"
+        assert len([x for x in self.probs if (x >= 0) and (x <= 1)]) == 4, f"At least one of padding probs is not in [0,1] - {self.probs}"
+        if len([x for x in self.probs if x > 0]) == 1:
+            LOGGER.warning(f"Only one padding prob is greater than zero - {self.probs}. That means that the outpainting masks will be always on the same side")
+    def apply_padding(self, mask, coord):
+        mask[int(coord[0][0]*self.img_h):int(coord[1][0]*self.img_h),
+             int(coord[0][1]*self.img_w):int(coord[1][1]*self.img_w)] = 1
+        return mask
+    def get_padding(self, size):
+        n1 = int(self.min_padding_percent*size)
+        n2 = int(self.max_padding_percent*size)
+        return self.rnd.randint(n1, n2) / size
+    @staticmethod
+    def _img2rs(img):
+        arr = np.ascontiguousarray(img.astype(np.uint8))
+        str_hash = hashlib.sha1(arr).hexdigest()
+        res = hash(str_hash)%(2**32)
+        return res
+    def __call__(self, img, iter_i=None, raw_image=None):
+        c, self.img_h, self.img_w = img.shape
+        mask = np.zeros((self.img_h, self.img_w), np.float32)
+        at_least_one_mask_applied = False
+        if self.is_fixed_randomness:
+            assert raw_image is not None, f"Cant calculate hash on raw_image=None"
+            rs = self._img2rs(raw_image)
+            self.rnd = np.random.RandomState(rs)
+        else:
+            self.rnd = np.random
+        coords = [[
+                   (0,0),
+                   (1,self.get_padding(size=self.img_h))
+                  ],
+                  [
+                    (0,0),
+                    (self.get_padding(size=self.img_w),1)
+                  ],
+                  [
+                    (0,1-self.get_padding(size=self.img_h)),
+                    (1,1)
+                  ],
+                  [
+                    (1-self.get_padding(size=self.img_w),0),
+                    (1,1)
+                  ]]
+        for pp, coord in zip(self.probs, coords):
+            if self.rnd.random() < pp:
+                at_least_one_mask_applied = True
+                mask = self.apply_padding(mask=mask, coord=coord)
+        if not at_least_one_mask_applied:
+            idx = self.rnd.choice(range(len(coords)), p=np.array(self.probs)/sum(self.probs))
+            mask = self.apply_padding(mask=mask, coord=coords[idx])
+        return mask[None, ...]
+class MixedMaskGenerator:
+    def __init__(self, irregular_proba=1/3, irregular_kwargs=None,
+                 box_proba=1/3, box_kwargs=None,
+                 segm_proba=1/3, segm_kwargs=None,
+                 squares_proba=0, squares_kwargs=None,
+                 superres_proba=0, superres_kwargs=None,
+                 outpainting_proba=0, outpainting_kwargs=None,
+                 invert_proba=0):
+        self.probas = []
+        self.gens = []
+        if irregular_proba > 0:
+            self.probas.append(irregular_proba)
+            if irregular_kwargs is None:
+                irregular_kwargs = {}
+            else:
+                irregular_kwargs = dict(irregular_kwargs)
+            irregular_kwargs['draw_method'] = DrawMethod.LINE
+            self.gens.append(RandomIrregularMaskGenerator(**irregular_kwargs))
+        if box_proba > 0:
+            self.probas.append(box_proba)
+            if box_kwargs is None:
+                box_kwargs = {}
+            self.gens.append(RandomRectangleMaskGenerator(**box_kwargs))
+        if segm_proba > 0:
+            self.probas.append(segm_proba)
+            if segm_kwargs is None:
+                segm_kwargs = {}
+            self.gens.append(RandomSegmentationMaskGenerator(**segm_kwargs))
+        if squares_proba > 0:
+            self.probas.append(squares_proba)
+            if squares_kwargs is None:
+                squares_kwargs = {}
+            else:
+                squares_kwargs = dict(squares_kwargs)
+            squares_kwargs['draw_method'] = DrawMethod.SQUARE
+            self.gens.append(RandomIrregularMaskGenerator(**squares_kwargs))
+        if superres_proba > 0:
+            self.probas.append(superres_proba)
+            if superres_kwargs is None:
+                superres_kwargs = {}
+            self.gens.append(RandomSuperresMaskGenerator(**superres_kwargs))
+        if outpainting_proba > 0:
+            self.probas.append(outpainting_proba)
+            if outpainting_kwargs is None:
+                outpainting_kwargs = {}
+            self.gens.append(OutpaintingMaskGenerator(**outpainting_kwargs))
+        self.probas = np.array(self.probas, dtype='float32')
+        self.probas /= self.probas.sum()
+        self.invert_proba = invert_proba
+    def __call__(self, img, iter_i=None, raw_image=None):
+        kind = np.random.choice(len(self.probas), p=self.probas)
+        gen = self.gens[kind]
+        result = gen(img, iter_i=iter_i, raw_image=raw_image)
+        if self.invert_proba > 0 and random.random() < self.invert_proba:
+            result = 1 - result
+        return result
+def get_mask_generator(kind, kwargs):
+    if kind is None:
+        kind = "mixed"
+    if kwargs is None:
+        kwargs = {}
+    if kind == "mixed":
+        cl = MixedMaskGenerator
+    elif kind == "outpainting":
+        cl = OutpaintingMaskGenerator
+    elif kind == "dumb":
+        cl = DumbAreaMaskGenerator
+    else:
+        raise NotImplementedError(f"No such generator kind = {kind}")
+    return cl(**kwargs)

annotator/lama/saicinpainting/training/losses/__init__.py ADDED Viewed

File without changes

annotator/lama/saicinpainting/training/losses/adversarial.py ADDED Viewed

	@@ -0,0 +1,177 @@

+from typing import Tuple, Dict, Optional
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+class BaseAdversarialLoss:
+    def pre_generator_step(self, real_batch: torch.Tensor, fake_batch: torch.Tensor,
+                           generator: nn.Module, discriminator: nn.Module):
+        """
+        Prepare for generator step
+        :param real_batch: Tensor, a batch of real samples
+        :param fake_batch: Tensor, a batch of samples produced by generator
+        :param generator:
+        :param discriminator:
+        :return: None
+        """
+    def pre_discriminator_step(self, real_batch: torch.Tensor, fake_batch: torch.Tensor,
+                               generator: nn.Module, discriminator: nn.Module):
+        """
+        Prepare for discriminator step
+        :param real_batch: Tensor, a batch of real samples
+        :param fake_batch: Tensor, a batch of samples produced by generator
+        :param generator:
+        :param discriminator:
+        :return: None
+        """
+    def generator_loss(self, real_batch: torch.Tensor, fake_batch: torch.Tensor,
+                       discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor,
+                       mask: Optional[torch.Tensor] = None) \
+            -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+        """
+        Calculate generator loss
+        :param real_batch: Tensor, a batch of real samples
+        :param fake_batch: Tensor, a batch of samples produced by generator
+        :param discr_real_pred: Tensor, discriminator output for real_batch
+        :param discr_fake_pred: Tensor, discriminator output for fake_batch
+        :param mask: Tensor, actual mask, which was at input of generator when making fake_batch
+        :return: total generator loss along with some values that might be interesting to log
+        """
+        raise NotImplemented()
+    def discriminator_loss(self, real_batch: torch.Tensor, fake_batch: torch.Tensor,
+                           discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor,
+                           mask: Optional[torch.Tensor] = None) \
+            -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+        """
+        Calculate discriminator loss and call .backward() on it
+        :param real_batch: Tensor, a batch of real samples
+        :param fake_batch: Tensor, a batch of samples produced by generator
+        :param discr_real_pred: Tensor, discriminator output for real_batch
+        :param discr_fake_pred: Tensor, discriminator output for fake_batch
+        :param mask: Tensor, actual mask, which was at input of generator when making fake_batch
+        :return: total discriminator loss along with some values that might be interesting to log
+        """
+        raise NotImplemented()
+    def interpolate_mask(self, mask, shape):
+        assert mask is not None
+        assert self.allow_scale_mask or shape == mask.shape[-2:]
+        if shape != mask.shape[-2:] and self.allow_scale_mask:
+            if self.mask_scale_mode == 'maxpool':
+                mask = F.adaptive_max_pool2d(mask, shape)
+            else:
+                mask = F.interpolate(mask, size=shape, mode=self.mask_scale_mode)
+        return mask
+def make_r1_gp(discr_real_pred, real_batch):
+    if torch.is_grad_enabled():
+        grad_real = torch.autograd.grad(outputs=discr_real_pred.sum(), inputs=real_batch, create_graph=True)[0]
+        grad_penalty = (grad_real.view(grad_real.shape[0], -1).norm(2, dim=1) ** 2).mean()
+    else:
+        grad_penalty = 0
+    real_batch.requires_grad = False
+    return grad_penalty
+class NonSaturatingWithR1(BaseAdversarialLoss):
+    """Softplus-based adversarial loss with a gradient penalty on real samples.
+
+    The mask-related flags select whether the loss terms are restricted to, or
+    re-weighted by, the masked region.
+    """
+    def __init__(self, gp_coef=5, weight=1, mask_as_fake_target=False, allow_scale_mask=False,
+                 mask_scale_mode='nearest', extra_mask_weight_for_gen=0,
+                 use_unmasked_for_gen=True, use_unmasked_for_discr=True):
+        # gp_coef scales the gradient penalty; weight scales the generator loss.
+        self.gp_coef = gp_coef
+        self.weight = weight
+        # use for discr => use for gen;
+        # otherwise we teach only the discr to pay attention to very small difference
+        assert use_unmasked_for_gen or (not use_unmasked_for_discr)
+        # mask as target => use unmasked for discr:
+        # if we don't care about unmasked regions at all
+        # then it doesn't matter if the value of mask_as_fake_target is true or false
+        assert use_unmasked_for_discr or (not mask_as_fake_target)
+        self.use_unmasked_for_gen = use_unmasked_for_gen
+        self.use_unmasked_for_discr = use_unmasked_for_discr
+        self.mask_as_fake_target = mask_as_fake_target
+        # allow_scale_mask / mask_scale_mode are read by interpolate_mask.
+        self.allow_scale_mask = allow_scale_mask
+        self.mask_scale_mode = mask_scale_mode
+        self.extra_mask_weight_for_gen = extra_mask_weight_for_gen
+    def generator_loss(self, real_batch: torch.Tensor, fake_batch: torch.Tensor,
+                       discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor,
+                       mask=None) \
+            -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+        """Return (weighted mean of softplus(-discr_fake_pred), empty metrics dict).
+
+        The mask is required (and rescaled to the prediction size) only in the branch
+        below that treats the masked region differently.
+        """
+        fake_loss = F.softplus(-discr_fake_pred)
+        if (self.mask_as_fake_target and self.extra_mask_weight_for_gen > 0) or \
+                not self.use_unmasked_for_gen:  # == if masked region should be treated differently
+            mask = self.interpolate_mask(mask, discr_fake_pred.shape[-2:])
+            if not self.use_unmasked_for_gen:
+                # Only the masked region contributes to the loss.
+                fake_loss = fake_loss * mask
+            else:
+                # Every pixel contributes; masked pixels get extra weight.
+                pixel_weights = 1 + mask * self.extra_mask_weight_for_gen
+                fake_loss = fake_loss * pixel_weights
+        return fake_loss.mean() * self.weight, dict()
+    def pre_discriminator_step(self, real_batch: torch.Tensor, fake_batch: torch.Tensor,
+                               generator: nn.Module, discriminator: nn.Module):
+        """Enable gradients on real_batch so make_r1_gp can differentiate w.r.t. it."""
+        real_batch.requires_grad = True
+    def discriminator_loss(self, real_batch: torch.Tensor, fake_batch: torch.Tensor,
+                           discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor,
+                           mask=None) \
+            -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+        """Return (mean discriminator loss, metrics dict).
+
+        The loss sums the real term, the scaled gradient penalty and the (optionally
+        masked) fake term. Metrics hold the mean real/fake predictions and the penalty.
+        """
+        real_loss = F.softplus(-discr_real_pred)
+        grad_penalty = make_r1_gp(discr_real_pred, real_batch) * self.gp_coef
+        fake_loss = F.softplus(discr_fake_pred)
+        if not self.use_unmasked_for_discr or self.mask_as_fake_target:
+            # == if masked region should be treated differently
+            mask = self.interpolate_mask(mask, discr_fake_pred.shape[-2:])
+            # use_unmasked_for_discr=False only makes sense for fakes;
+            # for reals there is no difference between two regions
+            fake_loss = fake_loss * mask
+            if self.mask_as_fake_target:
+                # Outside the mask the fake prediction is pushed towards the "real" side.
+                fake_loss = fake_loss + (1 - mask) * F.softplus(-discr_fake_pred)
+        sum_discr_loss = real_loss + grad_penalty + fake_loss
+        metrics = dict(discr_real_out=discr_real_pred.mean(),
+                       discr_fake_out=discr_fake_pred.mean(),
+                       discr_real_gp=grad_penalty)
+        return sum_discr_loss.mean(), metrics
+class BCELoss(BaseAdversarialLoss):
+    def __init__(self, weight):
+        self.weight = weight
+        self.bce_loss = nn.BCEWithLogitsLoss()
+    def generator_loss(self, discr_fake_pred: torch.Tensor) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+        real_mask_gt = torch.zeros(discr_fake_pred.shape).to(discr_fake_pred.device)
+        fake_loss = self.bce_loss(discr_fake_pred, real_mask_gt) * self.weight
+        return fake_loss, dict()
+    def pre_discriminator_step(self, real_batch: torch.Tensor, fake_batch: torch.Tensor,
+                               generator: nn.Module, discriminator: nn.Module):
+        real_batch.requires_grad = True
+    def discriminator_loss(self,
+                           mask: torch.Tensor,
+                           discr_real_pred: torch.Tensor,
+                           discr_fake_pred: torch.Tensor) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+        real_mask_gt = torch.zeros(discr_real_pred.shape).to(discr_real_pred.device)
+        sum_discr_loss = (self.bce_loss(discr_real_pred, real_mask_gt) +  self.bce_loss(discr_fake_pred, mask)) / 2
+        metrics = dict(discr_real_out=discr_real_pred.mean(),
+                       discr_fake_out=discr_fake_pred.mean(),
+                       discr_real_gp=0)
+        return sum_discr_loss, metrics
+def make_discrim_loss(kind, **kwargs):
+    if kind == 'r1':
+        return NonSaturatingWithR1(**kwargs)
+    elif kind == 'bce':
+        return BCELoss(**kwargs)
+    raise ValueError(f'Unknown adversarial loss kind {kind}')

annotator/lama/saicinpainting/training/losses/constants.py ADDED Viewed

	@@ -0,0 +1,152 @@

+# Per-dataset lists of class weights, keyed by dataset name.
+# The "ade20k" entry holds 150 values listed in non-decreasing order.
+# segmentation.CrossEntropy2d looks a list up by its key and passes it to
+# F.cross_entropy as the per-class `weight` tensor.
+# NOTE(review): the values look like inverse class frequencies - confirm.
+weights = {"ade20k":
+    [6.34517766497462,
+    9.328358208955224,
+    11.389521640091116,
+    16.10305958132045,
+    20.833333333333332,
+    22.22222222222222,
+    25.125628140703515,
+    43.29004329004329,
+    50.5050505050505,
+    54.6448087431694,
+    55.24861878453038,
+    60.24096385542168,
+    62.5,
+    66.2251655629139,
+    84.74576271186442,
+    90.90909090909092,
+    91.74311926605505,
+    96.15384615384616,
+    96.15384615384616,
+    97.08737864077669,
+    102.04081632653062,
+    135.13513513513513,
+    149.2537313432836,
+    153.84615384615384,
+    163.93442622950818,
+    166.66666666666666,
+    188.67924528301887,
+    192.30769230769232,
+    217.3913043478261,
+    227.27272727272725,
+    227.27272727272725,
+    227.27272727272725,
+    303.03030303030306,
+    322.5806451612903,
+    333.3333333333333,
+    370.3703703703703,
+    384.61538461538464,
+    416.6666666666667,
+    416.6666666666667,
+    434.7826086956522,
+    434.7826086956522,
+    454.5454545454545,
+    454.5454545454545,
+    500.0,
+    526.3157894736842,
+    526.3157894736842,
+    555.5555555555555,
+    555.5555555555555,
+    555.5555555555555,
+    555.5555555555555,
+    555.5555555555555,
+    555.5555555555555,
+    555.5555555555555,
+    588.2352941176471,
+    588.2352941176471,
+    588.2352941176471,
+    588.2352941176471,
+    588.2352941176471,
+    666.6666666666666,
+    666.6666666666666,
+    666.6666666666666,
+    666.6666666666666,
+    714.2857142857143,
+    714.2857142857143,
+    714.2857142857143,
+    714.2857142857143,
+    714.2857142857143,
+    769.2307692307693,
+    769.2307692307693,
+    769.2307692307693,
+    833.3333333333334,
+    833.3333333333334,
+    833.3333333333334,
+    833.3333333333334,
+    909.090909090909,
+    1000.0,
+    1111.111111111111,
+    1111.111111111111,
+    1111.111111111111,
+    1111.111111111111,
+    1111.111111111111,
+    1250.0,
+    1250.0,
+    1250.0,
+    1250.0,
+    1250.0,
+    1428.5714285714287,
+    1428.5714285714287,
+    1428.5714285714287,
+    1428.5714285714287,
+    1428.5714285714287,
+    1428.5714285714287,
+    1428.5714285714287,
+    1666.6666666666667,
+    1666.6666666666667,
+    1666.6666666666667,
+    1666.6666666666667,
+    1666.6666666666667,
+    1666.6666666666667,
+    1666.6666666666667,
+    1666.6666666666667,
+    1666.6666666666667,
+    1666.6666666666667,
+    1666.6666666666667,
+    2000.0,
+    2000.0,
+    2000.0,
+    2000.0,
+    2000.0,
+    2000.0,
+    2000.0,
+    2000.0,
+    2000.0,
+    2000.0,
+    2000.0,
+    2000.0,
+    2000.0,
+    2000.0,
+    2000.0,
+    2000.0,
+    2000.0,
+    2500.0,
+    2500.0,
+    2500.0,
+    2500.0,
+    2500.0,
+    2500.0,
+    2500.0,
+    2500.0,
+    2500.0,
+    2500.0,
+    2500.0,
+    2500.0,
+    2500.0,
+    3333.3333333333335,
+    3333.3333333333335,
+    3333.3333333333335,
+    3333.3333333333335,
+    3333.3333333333335,
+    3333.3333333333335,
+    3333.3333333333335,
+    3333.3333333333335,
+    3333.3333333333335,
+    3333.3333333333335,
+    3333.3333333333335,
+    3333.3333333333335,
+    3333.3333333333335,
+    5000.0,
+    5000.0,
+    5000.0]
+}

annotator/lama/saicinpainting/training/losses/distance_weighting.py ADDED Viewed

	@@ -0,0 +1,126 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchvision
+from annotator.lama.saicinpainting.training.losses.perceptual import IMAGENET_STD, IMAGENET_MEAN
+def dummy_distance_weighter(real_img, pred_img, mask):
+    return mask
+def get_gauss_kernel(kernel_size, width_factor=1):
+    coords = torch.stack(torch.meshgrid(torch.arange(kernel_size),
+                                        torch.arange(kernel_size)),
+                         dim=0).float()
+    diff = torch.exp(-((coords - kernel_size // 2) ** 2).sum(0) / kernel_size / width_factor)
+    diff /= diff.sum()
+    return diff
+class BlurMask(nn.Module):
+    def __init__(self, kernel_size=5, width_factor=1):
+        super().__init__()
+        self.filter = nn.Conv2d(1, 1, kernel_size, padding=kernel_size // 2, padding_mode='replicate', bias=False)
+        self.filter.weight.data.copy_(get_gauss_kernel(kernel_size, width_factor=width_factor))
+    def forward(self, real_img, pred_img, mask):
+        with torch.no_grad():
+            result = self.filter(mask) * mask
+            return result
+class EmulatedEDTMask(nn.Module):
+    def __init__(self, dilate_kernel_size=5, blur_kernel_size=5, width_factor=1):
+        super().__init__()
+        self.dilate_filter = nn.Conv2d(1, 1, dilate_kernel_size, padding=dilate_kernel_size// 2, padding_mode='replicate',
+                                       bias=False)
+        self.dilate_filter.weight.data.copy_(torch.ones(1, 1, dilate_kernel_size, dilate_kernel_size, dtype=torch.float))
+        self.blur_filter = nn.Conv2d(1, 1, blur_kernel_size, padding=blur_kernel_size // 2, padding_mode='replicate', bias=False)
+        self.blur_filter.weight.data.copy_(get_gauss_kernel(blur_kernel_size, width_factor=width_factor))
+    def forward(self, real_img, pred_img, mask):
+        with torch.no_grad():
+            known_mask = 1 - mask
+            dilated_known_mask = (self.dilate_filter(known_mask) > 1).float()
+            result = self.blur_filter(1 - dilated_known_mask) * mask
+            return result
+class PropagatePerceptualSim(nn.Module):
+    """
+    Mask weighter that spreads "knowness" from known pixels into the masked area,
+    attenuated by how much truncated-VGG19 features of the real image differ between
+    neighbouring positions.
+    :param level: number of ReLU layers after which the VGG19 feature stack is cut
+    :param max_iters: number of propagation sweeps run in forward
+    :param temperature: divisor of the squared feature difference inside exp(-x)
+    :param erode_mask_size: kernel size of the all-ones convolution applied to the
+        resized mask; a value <= 0 disables that step
+    """
+    def __init__(self, level=2, max_iters=10, temperature=500, erode_mask_size=3):
+        super().__init__()
+        vgg = torchvision.models.vgg19(pretrained=True).features
+        vgg_avg_pooling = []
+        # The VGG weights are frozen.
+        for weights in vgg.parameters():
+            weights.requires_grad = False
+        cur_level_i = 0
+        # Copy layers one by one, replacing max pooling with average pooling, and
+        # stop once `level` ReLU layers have been collected.
+        for module in vgg.modules():
+            if module.__class__.__name__ == 'Sequential':
+                # modules() also yields the container itself; skip it
+                continue
+            elif module.__class__.__name__ == 'MaxPool2d':
+                vgg_avg_pooling.append(nn.AvgPool2d(kernel_size=2, stride=2, padding=0))
+            else:
+                vgg_avg_pooling.append(module)
+                if module.__class__.__name__ == 'ReLU':
+                    cur_level_i += 1
+                if cur_level_i == level:
+                    break
+        self.features = nn.Sequential(*vgg_avg_pooling)
+        self.max_iters = max_iters
+        self.temperature = temperature
+        self.do_erode = erode_mask_size > 0
+        if self.do_erode:
+            # All-ones box filter (zero padding by default), thresholded in forward.
+            self.erode_mask = nn.Conv2d(1, 1, erode_mask_size, padding=erode_mask_size // 2, bias=False)
+            self.erode_mask.weight.data.fill_(1)
+    def forward(self, real_img, pred_img, mask):
+        # pred_img is not used; the whole computation runs without gradients.
+        with torch.no_grad():
+            real_img = (real_img - IMAGENET_MEAN.to(real_img)) / IMAGENET_STD.to(real_img)
+            real_feats = self.features(real_img)
+            # Similarity between vertically / horizontally adjacent feature positions:
+            # exp(-squared channel-summed difference / temperature).
+            vertical_sim = torch.exp(-(real_feats[:, :, 1:] - real_feats[:, :, :-1]).pow(2).sum(1, keepdim=True)
+                                     / self.temperature)
+            horizontal_sim = torch.exp(-(real_feats[:, :, :, 1:] - real_feats[:, :, :, :-1]).pow(2).sum(1, keepdim=True)
+                                       / self.temperature)
+            # Bring the mask to the feature-map resolution.
+            mask_scaled = F.interpolate(mask, size=real_feats.shape[-2:], mode='bilinear', align_corners=False)
+            if self.do_erode:
+                # Binarise: 1 wherever the window sum of the resized mask exceeds 1.
+                mask_scaled = (self.erode_mask(mask_scaled) > 1).float()
+            cur_knowness = 1 - mask_scaled
+            for iter_i in range(self.max_iters):
+                # Knowness arriving from each of the four neighbours, scaled by the
+                # similarity across that edge; padding restores the original size.
+                new_top_knowness = F.pad(cur_knowness[:, :, :-1] * vertical_sim, (0, 0, 1, 0), mode='replicate')
+                new_bottom_knowness = F.pad(cur_knowness[:, :, 1:] * vertical_sim, (0, 0, 0, 1), mode='replicate')
+                new_left_knowness = F.pad(cur_knowness[:, :, :, :-1] * horizontal_sim, (1, 0, 0, 0), mode='replicate')
+                new_right_knowness = F.pad(cur_knowness[:, :, :, 1:] * horizontal_sim, (0, 1, 0, 0), mode='replicate')
+                # Keep the strongest incoming value, and never decrease knowness.
+                new_knowness = torch.stack([new_top_knowness, new_bottom_knowness,
+                                            new_left_knowness, new_right_knowness],
+                                           dim=0).max(0).values
+                cur_knowness = torch.max(cur_knowness, new_knowness)
+            # Back to the mask resolution; the weight is capped by the mask itself.
+            cur_knowness = F.interpolate(cur_knowness, size=mask.shape[-2:], mode='bilinear')
+            result = torch.min(mask, 1 - cur_knowness)
+            return result
+def make_mask_distance_weighter(kind='none', **kwargs):
+    if kind == 'none':
+        return dummy_distance_weighter
+    if kind == 'blur':
+        return BlurMask(**kwargs)
+    if kind == 'edt':
+        return EmulatedEDTMask(**kwargs)
+    if kind == 'pps':
+        return PropagatePerceptualSim(**kwargs)
+    raise ValueError(f'Unknown mask distance weighter kind {kind}')

annotator/lama/saicinpainting/training/losses/feature_matching.py ADDED Viewed

	@@ -0,0 +1,33 @@

+from typing import List
+import torch
+import torch.nn.functional as F
+def masked_l2_loss(pred, target, mask, weight_known, weight_missing):
+    per_pixel_l2 = F.mse_loss(pred, target, reduction='none')
+    pixel_weights = mask * weight_missing + (1 - mask) * weight_known
+    return (pixel_weights * per_pixel_l2).mean()
+def masked_l1_loss(pred, target, mask, weight_known, weight_missing):
+    per_pixel_l1 = F.l1_loss(pred, target, reduction='none')
+    pixel_weights = mask * weight_missing + (1 - mask) * weight_known
+    return (pixel_weights * per_pixel_l1).mean()
+def feature_matching_loss(fake_features: List[torch.Tensor], target_features: List[torch.Tensor], mask=None):
+    if mask is None:
+        res = torch.stack([F.mse_loss(fake_feat, target_feat)
+                           for fake_feat, target_feat in zip(fake_features, target_features)]).mean()
+    else:
+        res = 0
+        norm = 0
+        for fake_feat, target_feat in zip(fake_features, target_features):
+            cur_mask = F.interpolate(mask, size=fake_feat.shape[-2:], mode='bilinear', align_corners=False)
+            error_weights = 1 - cur_mask
+            cur_val = ((fake_feat - target_feat).pow(2) * error_weights).mean()
+            res = res + cur_val
+            norm += 1
+        res = res / norm
+    return res

annotator/lama/saicinpainting/training/losses/perceptual.py ADDED Viewed

	@@ -0,0 +1,113 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torchvision
+# from models.ade20k import ModelBuilder
+from annotator.lama.saicinpainting.utils import check_and_warn_input_range
+# Three-value mean and std constants subtracted from / divided into images before the
+# feature extractors in this file. Indexing with [None, :, None, None] gives them shape
+# (1, 3, 1, 1) so they broadcast over 4-D batches (presumably (N, 3, H, W) - confirm).
+IMAGENET_MEAN = torch.FloatTensor([0.485, 0.456, 0.406])[None, :, None, None]
+IMAGENET_STD = torch.FloatTensor([0.229, 0.224, 0.225])[None, :, None, None]
+class PerceptualLoss(nn.Module):
+    def __init__(self, normalize_inputs=True):
+        super(PerceptualLoss, self).__init__()
+        self.normalize_inputs = normalize_inputs
+        self.mean_ = IMAGENET_MEAN
+        self.std_ = IMAGENET_STD
+        vgg = torchvision.models.vgg19(pretrained=True).features
+        vgg_avg_pooling = []
+        for weights in vgg.parameters():
+            weights.requires_grad = False
+        for module in vgg.modules():
+            if module.__class__.__name__ == 'Sequential':
+                continue
+            elif module.__class__.__name__ == 'MaxPool2d':
+                vgg_avg_pooling.append(nn.AvgPool2d(kernel_size=2, stride=2, padding=0))
+            else:
+                vgg_avg_pooling.append(module)
+        self.vgg = nn.Sequential(*vgg_avg_pooling)
+    def do_normalize_inputs(self, x):
+        return (x - self.mean_.to(x.device)) / self.std_.to(x.device)
+    def partial_losses(self, input, target, mask=None):
+        check_and_warn_input_range(target, 0, 1, 'PerceptualLoss target in partial_losses')
+        # we expect input and target to be in [0, 1] range
+        losses = []
+        if self.normalize_inputs:
+            features_input = self.do_normalize_inputs(input)
+            features_target = self.do_normalize_inputs(target)
+        else:
+            features_input = input
+            features_target = target
+        for layer in self.vgg[:30]:
+            features_input = layer(features_input)
+            features_target = layer(features_target)
+            if layer.__class__.__name__ == 'ReLU':
+                loss = F.mse_loss(features_input, features_target, reduction='none')
+                if mask is not None:
+                    cur_mask = F.interpolate(mask, size=features_input.shape[-2:],
+                                             mode='bilinear', align_corners=False)
+                    loss = loss * (1 - cur_mask)
+                loss = loss.mean(dim=tuple(range(1, len(loss.shape))))
+                losses.append(loss)
+        return losses
+    def forward(self, input, target, mask=None):
+        losses = self.partial_losses(input, target, mask=mask)
+        return torch.stack(losses).sum(dim=0)
+    def get_global_features(self, input):
+        check_and_warn_input_range(input, 0, 1, 'PerceptualLoss input in get_global_features')
+        if self.normalize_inputs:
+            features_input = self.do_normalize_inputs(input)
+        else:
+            features_input = input
+        features_input = self.vgg(features_input)
+        return features_input
+class ResNetPL(nn.Module):
+    def __init__(self, weight=1,
+                 weights_path=None, arch_encoder='resnet50dilated', segmentation=True):
+        super().__init__()
+        self.impl = ModelBuilder.get_encoder(weights_path=weights_path,
+                                             arch_encoder=arch_encoder,
+                                             arch_decoder='ppm_deepsup',
+                                             fc_dim=2048,
+                                             segmentation=segmentation)
+        self.impl.eval()
+        for w in self.impl.parameters():
+            w.requires_grad_(False)
+        self.weight = weight
+    def forward(self, pred, target):
+        pred = (pred - IMAGENET_MEAN.to(pred)) / IMAGENET_STD.to(pred)
+        target = (target - IMAGENET_MEAN.to(target)) / IMAGENET_STD.to(target)
+        pred_feats = self.impl(pred, return_feature_maps=True)
+        target_feats = self.impl(target, return_feature_maps=True)
+        result = torch.stack([F.mse_loss(cur_pred, cur_target)
+                              for cur_pred, cur_target
+                              in zip(pred_feats, target_feats)]).sum() * self.weight
+        return result

annotator/lama/saicinpainting/training/losses/segmentation.py ADDED Viewed

	@@ -0,0 +1,43 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from .constants import weights as constant_weights
+class CrossEntropy2d(nn.Module):
+    def __init__(self, reduction="mean", ignore_label=255, weights=None, *args, **kwargs):
+        """
+        weight (Tensor, optional): a manual rescaling weight given to each class.
+            If given, has to be a Tensor of size "nclasses"
+        """
+        super(CrossEntropy2d, self).__init__()
+        self.reduction = reduction
+        self.ignore_label = ignore_label
+        self.weights = weights
+        if self.weights is not None:
+            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+            self.weights = torch.FloatTensor(constant_weights[weights]).to(device)
+    def forward(self, predict, target):
+        """
+            Args:
+                predict:(n, c, h, w)
+                target:(n, 1, h, w)
+        """
+        target = target.long()
+        assert not target.requires_grad
+        assert predict.dim() == 4, "{0}".format(predict.size())
+        assert target.dim() == 4, "{0}".format(target.size())
+        assert predict.size(0) == target.size(0), "{0} vs {1} ".format(predict.size(0), target.size(0))
+        assert target.size(1) == 1, "{0}".format(target.size(1))
+        assert predict.size(2) == target.size(2), "{0} vs {1} ".format(predict.size(2), target.size(2))
+        assert predict.size(3) == target.size(3), "{0} vs {1} ".format(predict.size(3), target.size(3))
+        target = target.squeeze(1)
+        n, c, h, w = predict.size()
+        target_mask = (target >= 0) * (target != self.ignore_label)
+        target = target[target_mask]
+        predict = predict.transpose(1, 2).transpose(2, 3).contiguous()
+        predict = predict[target_mask.view(n, h, w, 1).repeat(1, 1, 1, c)].view(-1, c)
+        loss = F.cross_entropy(predict, target, weight=self.weights, reduction=self.reduction)
+        return loss

annotator/lama/saicinpainting/training/losses/style_loss.py ADDED Viewed

	@@ -0,0 +1,155 @@

+import torch
+import torch.nn as nn
+import torchvision.models as models
+class PerceptualLoss(nn.Module):
+    r"""
+    Perceptual loss, VGG-based
+    https://arxiv.org/abs/1603.08155
+    https://github.com/dxyang/StyleTransfer/blob/master/utils.py
+    """
+    def __init__(self, weights=[1.0, 1.0, 1.0, 1.0, 1.0]):
+        super(PerceptualLoss, self).__init__()
+        self.add_module('vgg', VGG19())
+        self.criterion = torch.nn.L1Loss()
+        self.weights = weights
+    def __call__(self, x, y):
+        # Compute features
+        x_vgg, y_vgg = self.vgg(x), self.vgg(y)
+        content_loss = 0.0
+        content_loss += self.weights[0] * self.criterion(x_vgg['relu1_1'], y_vgg['relu1_1'])
+        content_loss += self.weights[1] * self.criterion(x_vgg['relu2_1'], y_vgg['relu2_1'])
+        content_loss += self.weights[2] * self.criterion(x_vgg['relu3_1'], y_vgg['relu3_1'])
+        content_loss += self.weights[3] * self.criterion(x_vgg['relu4_1'], y_vgg['relu4_1'])
+        content_loss += self.weights[4] * self.criterion(x_vgg['relu5_1'], y_vgg['relu5_1'])
+        return content_loss
+class VGG19(torch.nn.Module):
+    def __init__(self):
+        super(VGG19, self).__init__()
+        features = models.vgg19(pretrained=True).features
+        self.relu1_1 = torch.nn.Sequential()
+        self.relu1_2 = torch.nn.Sequential()
+        self.relu2_1 = torch.nn.Sequential()
+        self.relu2_2 = torch.nn.Sequential()
+        self.relu3_1 = torch.nn.Sequential()
+        self.relu3_2 = torch.nn.Sequential()
+        self.relu3_3 = torch.nn.Sequential()
+        self.relu3_4 = torch.nn.Sequential()
+        self.relu4_1 = torch.nn.Sequential()
+        self.relu4_2 = torch.nn.Sequential()
+        self.relu4_3 = torch.nn.Sequential()
+        self.relu4_4 = torch.nn.Sequential()
+        self.relu5_1 = torch.nn.Sequential()
+        self.relu5_2 = torch.nn.Sequential()
+        self.relu5_3 = torch.nn.Sequential()
+        self.relu5_4 = torch.nn.Sequential()
+        for x in range(2):
+            self.relu1_1.add_module(str(x), features[x])
+        for x in range(2, 4):
+            self.relu1_2.add_module(str(x), features[x])
+        for x in range(4, 7):
+            self.relu2_1.add_module(str(x), features[x])
+        for x in range(7, 9):
+            self.relu2_2.add_module(str(x), features[x])
+        for x in range(9, 12):
+            self.relu3_1.add_module(str(x), features[x])
+        for x in range(12, 14):
+            self.relu3_2.add_module(str(x), features[x])
+        for x in range(14, 16):
+            self.relu3_2.add_module(str(x), features[x])
+        for x in range(16, 18):
+            self.relu3_4.add_module(str(x), features[x])
+        for x in range(18, 21):
+            self.relu4_1.add_module(str(x), features[x])
+        for x in range(21, 23):
+            self.relu4_2.add_module(str(x), features[x])
+        for x in range(23, 25):
+            self.relu4_3.add_module(str(x), features[x])
+        for x in range(25, 27):
+            self.relu4_4.add_module(str(x), features[x])
+        for x in range(27, 30):
+            self.relu5_1.add_module(str(x), features[x])
+        for x in range(30, 32):
+            self.relu5_2.add_module(str(x), features[x])
+        for x in range(32, 34):
+            self.relu5_3.add_module(str(x), features[x])
+        for x in range(34, 36):
+            self.relu5_4.add_module(str(x), features[x])
+        # don't need the gradients, just want the features
+        for param in self.parameters():
+            param.requires_grad = False
+    def forward(self, x):
+        relu1_1 = self.relu1_1(x)
+        relu1_2 = self.relu1_2(relu1_1)
+        relu2_1 = self.relu2_1(relu1_2)
+        relu2_2 = self.relu2_2(relu2_1)
+        relu3_1 = self.relu3_1(relu2_2)
+        relu3_2 = self.relu3_2(relu3_1)
+        relu3_3 = self.relu3_3(relu3_2)
+        relu3_4 = self.relu3_4(relu3_3)
+        relu4_1 = self.relu4_1(relu3_4)
+        relu4_2 = self.relu4_2(relu4_1)
+        relu4_3 = self.relu4_3(relu4_2)
+        relu4_4 = self.relu4_4(relu4_3)
+        relu5_1 = self.relu5_1(relu4_4)
+        relu5_2 = self.relu5_2(relu5_1)
+        relu5_3 = self.relu5_3(relu5_2)
+        relu5_4 = self.relu5_4(relu5_3)
+        out = {
+            'relu1_1': relu1_1,
+            'relu1_2': relu1_2,
+            'relu2_1': relu2_1,
+            'relu2_2': relu2_2,
+            'relu3_1': relu3_1,
+            'relu3_2': relu3_2,
+            'relu3_3': relu3_3,
+            'relu3_4': relu3_4,
+            'relu4_1': relu4_1,
+            'relu4_2': relu4_2,
+            'relu4_3': relu4_3,
+            'relu4_4': relu4_4,
+            'relu5_1': relu5_1,
+            'relu5_2': relu5_2,
+            'relu5_3': relu5_3,
+            'relu5_4': relu5_4,
+        }
+        return out

annotator/lama/saicinpainting/training/modules/__init__.py ADDED Viewed

	@@ -0,0 +1,31 @@

+import logging
+from annotator.lama.saicinpainting.training.modules.ffc import FFCResNetGenerator
+from annotator.lama.saicinpainting.training.modules.pix2pixhd import GlobalGenerator, MultiDilatedGlobalGenerator, \
+    NLayerDiscriminator, MultidilatedNLayerDiscriminator
+def make_generator(config, kind, **kwargs):
+    logging.info(f'Make generator {kind}')
+    if kind == 'pix2pixhd_multidilated':
+        return MultiDilatedGlobalGenerator(**kwargs)
+    if kind == 'pix2pixhd_global':
+        return GlobalGenerator(**kwargs)
+    if kind == 'ffc_resnet':
+        return FFCResNetGenerator(**kwargs)
+    raise ValueError(f'Unknown generator kind {kind}')
+def make_discriminator(kind, **kwargs):
+    logging.info(f'Make discriminator {kind}')
+    if kind == 'pix2pixhd_nlayer_multidilated':
+        return MultidilatedNLayerDiscriminator(**kwargs)
+    if kind == 'pix2pixhd_nlayer':
+        return NLayerDiscriminator(**kwargs)
+    raise ValueError(f'Unknown discriminator kind {kind}')

annotator/lama/saicinpainting/training/modules/base.py ADDED Viewed

	@@ -0,0 +1,80 @@

+import abc
+from typing import Tuple, List
+import torch
+import torch.nn as nn
+from annotator.lama.saicinpainting.training.modules.depthwise_sep_conv import DepthWiseSeperableConv
+from annotator.lama.saicinpainting.training.modules.multidilated_conv import MultidilatedConv
+class BaseDiscriminator(nn.Module):
+    @abc.abstractmethod
+    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, List[torch.Tensor]]:
+        """
+        Predict scores and get intermediate activations. Useful for feature matching loss
+        :return tuple (scores, list of intermediate activations)
+        """
+        raise NotImplemented()
+def get_conv_block_ctor(kind='default'):
+    if not isinstance(kind, str):
+        return kind
+    if kind == 'default':
+        return nn.Conv2d
+    if kind == 'depthwise':
+        return DepthWiseSeperableConv
+    if kind == 'multidilated':
+        return MultidilatedConv
+    raise ValueError(f'Unknown convolutional block kind {kind}')
+def get_norm_layer(kind='bn'):
+    if not isinstance(kind, str):
+        return kind
+    if kind == 'bn':
+        return nn.BatchNorm2d
+    if kind == 'in':
+        return nn.InstanceNorm2d
+    raise ValueError(f'Unknown norm block kind {kind}')
+def get_activation(kind='tanh'):
+    if kind == 'tanh':
+        return nn.Tanh()
+    if kind == 'sigmoid':
+        return nn.Sigmoid()
+    if kind is False:
+        return nn.Identity()
+    raise ValueError(f'Unknown activation kind {kind}')
+class SimpleMultiStepGenerator(nn.Module):
+    def __init__(self, steps: List[nn.Module]):
+        super().__init__()
+        self.steps = nn.ModuleList(steps)
+    def forward(self, x):
+        cur_in = x
+        outs = []
+        for step in self.steps:
+            cur_out = step(cur_in)
+            outs.append(cur_out)
+            cur_in = torch.cat((cur_in, cur_out), dim=1)
+        return torch.cat(outs[::-1], dim=1)
+def deconv_factory(kind, ngf, mult, norm_layer, activation, max_features):
+    if kind == 'convtranspose':
+        return [nn.ConvTranspose2d(min(max_features, ngf * mult),
+                    min(max_features, int(ngf * mult / 2)),
+                    kernel_size=3, stride=2, padding=1, output_padding=1),
+                    norm_layer(min(max_features, int(ngf * mult / 2))), activation]
+    elif kind == 'bilinear':
+        return [nn.Upsample(scale_factor=2, mode='bilinear'),
+                DepthWiseSeperableConv(min(max_features, ngf * mult),
+                    min(max_features, int(ngf * mult / 2)),
+                    kernel_size=3, stride=1, padding=1),
+                norm_layer(min(max_features, int(ngf * mult / 2))), activation]
+    else:
+        raise Exception(f"Invalid deconv kind: {kind}")

annotator/lama/saicinpainting/training/modules/depthwise_sep_conv.py ADDED Viewed

	@@ -0,0 +1,17 @@

+import torch
+import torch.nn as nn
+class DepthWiseSeperableConv(nn.Module):
+    def __init__(self, in_dim, out_dim, *args, **kwargs):
+        super().__init__()
+        if 'groups' in kwargs:
+            # ignoring groups for Depthwise Sep Conv
+            del kwargs['groups']
+        self.depthwise = nn.Conv2d(in_dim, in_dim, *args, groups=in_dim, **kwargs)
+        self.pointwise = nn.Conv2d(in_dim, out_dim, kernel_size=1)
+    def forward(self, x):
+        out = self.depthwise(x)
+        out = self.pointwise(out)
+        return out

annotator/lama/saicinpainting/training/modules/fake_fakes.py ADDED Viewed

	@@ -0,0 +1,47 @@

+import torch
+from kornia import SamplePadding
+from kornia.augmentation import RandomAffine, CenterCrop
+class FakeFakesGenerator:
+    """
+    Blends each input image with another image (an augmented copy or a different
+    batch element) using a soft blend mask built from the given masks.
+    :param aug_proba: per-sample probability of blending with the augmented copy
+        rather than with a permuted batch element
+    :param img_aug_degree: ``degrees`` argument of the image RandomAffine
+    :param img_aug_translate: ``translate`` argument of the image RandomAffine
+    """
+    def __init__(self, aug_proba=0.5, img_aug_degree=30, img_aug_translate=0.2):
+        # Augmentation for the gradient ramp; its degrees/translate are fixed here and
+        # do not come from the constructor arguments.
+        self.grad_aug = RandomAffine(degrees=360,
+                                     translate=0.2,
+                                     padding_mode=SamplePadding.REFLECTION,
+                                     keepdim=False,
+                                     p=1)
+        # Augmentation applied to the input images themselves.
+        self.img_aug = RandomAffine(degrees=img_aug_degree,
+                                    translate=img_aug_translate,
+                                    padding_mode=SamplePadding.REFLECTION,
+                                    keepdim=True,
+                                    p=1)
+        self.aug_proba = aug_proba
+    def __call__(self, input_images, masks):
+        """Return ``(blended_images, blend_masks)``."""
+        blend_masks = self._fill_masks_with_gradient(masks)
+        blend_target = self._make_blend_target(input_images)
+        # Linear blend: the input where blend_masks is 0, the target where it is 1.
+        result = input_images * (1 - blend_masks) + blend_target * blend_masks
+        return result, blend_masks
+    def _make_blend_target(self, input_images):
+        """Pick, per sample, either an augmented copy or another image of the batch."""
+        batch_size = input_images.shape[0]
+        permuted = input_images[torch.randperm(batch_size)]
+        augmented = self.img_aug(input_images)
+        # 1.0 for samples that use the augmented copy, 0.0 for the permuted image
+        is_aug = (torch.rand(batch_size, device=input_images.device)[:, None, None, None] < self.aug_proba).float()
+        result = augmented * is_aug + permuted * (1 - is_aug)
+        return result
+    def _fill_masks_with_gradient(self, masks):
+        """Turn ``masks`` into soft masks holding a randomly transformed linear ramp."""
+        batch_size, _, height, width = masks.shape
+        # Ramp from 0 to 1 along the last axis, built at twice the mask height and width.
+        grad = torch.linspace(0, 1, steps=width * 2, device=masks.device, dtype=masks.dtype) \
+            .view(1, 1, 1, -1).expand(batch_size, 1, height * 2, width * 2)
+        grad = self.grad_aug(grad)
+        # Crop the centre back to the mask size.
+        grad = CenterCrop((height, width))(grad)
+        grad *= masks
+        # Adding 10 outside the mask keeps those positions from being picked as the minimum.
+        grad_for_min = grad + (1 - masks) * 10
+        # Per sample: subtract that minimum, divide by the maximum (1e-6 avoids a zero
+        # divisor), then clamp to [0, 1].
+        grad -= grad_for_min.view(batch_size, -1).min(-1).values[:, None, None, None]
+        grad /= grad.view(batch_size, -1).max(-1).values[:, None, None, None] + 1e-6
+        grad.clamp_(min=0, max=1)
+        return grad

annotator/lama/saicinpainting/training/modules/ffc.py ADDED Viewed

	@@ -0,0 +1,485 @@

+# Fast Fourier Convolution NeurIPS 2020
+# original implementation https://github.com/pkumivision/FFC/blob/main/model_zoo/ffc.py
+# paper https://proceedings.neurips.cc/paper/2020/file/2fd5d41ec6cfab47e32164d5624269b1-Paper.pdf
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from annotator.lama.saicinpainting.training.modules.base import get_activation, BaseDiscriminator
+from annotator.lama.saicinpainting.training.modules.spatial_transform import LearnableSpatialTransformWrapper
+from annotator.lama.saicinpainting.training.modules.squeeze_excitation import SELayer
+from annotator.lama.saicinpainting.utils import get_shape
+class FFCSE_block(nn.Module):
+    def __init__(self, channels, ratio_g):
+        super(FFCSE_block, self).__init__()
+        in_cg = int(channels * ratio_g)
+        in_cl = channels - in_cg
+        r = 16
+        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
+        self.conv1 = nn.Conv2d(channels, channels // r,
+                               kernel_size=1, bias=True)
+        self.relu1 = nn.ReLU(inplace=True)
+        self.conv_a2l = None if in_cl == 0 else nn.Conv2d(
+            channels // r, in_cl, kernel_size=1, bias=True)
+        self.conv_a2g = None if in_cg == 0 else nn.Conv2d(
+            channels // r, in_cg, kernel_size=1, bias=True)
+        self.sigmoid = nn.Sigmoid()
+    def forward(self, x):
+        x = x if type(x) is tuple else (x, 0)
+        id_l, id_g = x
+        x = id_l if type(id_g) is int else torch.cat([id_l, id_g], dim=1)
+        x = self.avgpool(x)
+        x = self.relu1(self.conv1(x))
+        x_l = 0 if self.conv_a2l is None else id_l * \
+            self.sigmoid(self.conv_a2l(x))
+        x_g = 0 if self.conv_a2g is None else id_g * \
+            self.sigmoid(self.conv_a2g(x))
+        return x_l, x_g
+class FourierUnit(nn.Module):
+    def __init__(self, in_channels, out_channels, groups=1, spatial_scale_factor=None, spatial_scale_mode='bilinear',
+                 spectral_pos_encoding=False, use_se=False, se_kwargs=None, ffc3d=False, fft_norm='ortho'):
+        # bn_layer not used
+        super(FourierUnit, self).__init__()
+        self.groups = groups
+        self.conv_layer = torch.nn.Conv2d(in_channels=in_channels * 2 + (2 if spectral_pos_encoding else 0),
+                                          out_channels=out_channels * 2,
+                                          kernel_size=1, stride=1, padding=0, groups=self.groups, bias=False)
+        self.bn = torch.nn.BatchNorm2d(out_channels * 2)
+        self.relu = torch.nn.ReLU(inplace=True)
+        # squeeze and excitation block
+        self.use_se = use_se
+        if use_se:
+            if se_kwargs is None:
+                se_kwargs = {}
+            self.se = SELayer(self.conv_layer.in_channels, **se_kwargs)
+        self.spatial_scale_factor = spatial_scale_factor
+        self.spatial_scale_mode = spatial_scale_mode
+        self.spectral_pos_encoding = spectral_pos_encoding
+        self.ffc3d = ffc3d
+        self.fft_norm = fft_norm
+    def forward(self, x):
+        """Filter ``x`` in the frequency domain and return it in the spatial domain.
+
+        Steps: optional rescale by ``spatial_scale_factor``, real FFT, real and
+        imaginary parts folded into the channel axis, optional coordinate
+        channels and ``se`` block, ``conv_layer`` -> ``bn`` -> ``relu``, then
+        the channels are unfolded into a complex tensor and inverted with
+        ``irfftn``. If the input was rescaled, the result is interpolated back
+        to the original spatial size.
+
+        The permute/view calls below index five axes after stacking, i.e. they
+        assume a 4-D input -- TODO confirm the layout expected with ``ffc3d``.
+        """
+        batch = x.shape[0]
+        if self.spatial_scale_factor is not None:
+            # keep the incoming spatial size so the output can be resized back to it
+            orig_size = x.shape[-2:]
+            x = F.interpolate(x, scale_factor=self.spatial_scale_factor, mode=self.spatial_scale_mode, align_corners=False)
+        # NOTE(review): r_size is assigned but never read in this method
+        r_size = x.size()
+        # (batch, c, h, w/2+1, 2)
+        fft_dim = (-3, -2, -1) if self.ffc3d else (-2, -1)
+        ffted = torch.fft.rfftn(x, dim=fft_dim, norm=self.fft_norm)
+        # expose the complex result as a trailing (real, imag) axis
+        ffted = torch.stack((ffted.real, ffted.imag), dim=-1)
+        ffted = ffted.permute(0, 1, 4, 2, 3).contiguous()  # (batch, c, 2, h, w/2+1)
+        # fold the (real, imag) axis into the channel axis
+        ffted = ffted.view((batch, -1,) + ffted.size()[3:])
+        if self.spectral_pos_encoding:
+            # prepend two channels holding vertical / horizontal coordinates in [0, 1]
+            height, width = ffted.shape[-2:]
+            coords_vert = torch.linspace(0, 1, height)[None, None, :, None].expand(batch, 1, height, width).to(ffted)
+            coords_hor = torch.linspace(0, 1, width)[None, None, None, :].expand(batch, 1, height, width).to(ffted)
+            ffted = torch.cat((coords_vert, coords_hor, ffted), dim=1)
+        if self.use_se:
+            ffted = self.se(ffted)
+        ffted = self.conv_layer(ffted)  # (batch, c*2, h, w/2+1)
+        ffted = self.relu(self.bn(ffted))
+        # unfold the channel axis back into (channels, real/imag) and rebuild a complex tensor
+        ffted = ffted.view((batch, -1, 2,) + ffted.size()[2:]).permute(
+            0, 1, 3, 4, 2).contiguous()  # (batch,c, t, h, w/2+1, 2)
+        ffted = torch.complex(ffted[..., 0], ffted[..., 1])
+        # pass the explicit output size so the inverse transform restores the size of x
+        ifft_shape_slice = x.shape[-3:] if self.ffc3d else x.shape[-2:]
+        output = torch.fft.irfftn(ffted, s=ifft_shape_slice, dim=fft_dim, norm=self.fft_norm)
+        if self.spatial_scale_factor is not None:
+            output = F.interpolate(output, size=orig_size, mode=self.spatial_scale_mode, align_corners=False)
+        return output
+class SeparableFourierUnit(nn.Module):
+    """Fourier unit that filters rows and columns with separate 1-D real FFTs.
+
+    One branch transforms along the last axis of the input, the other along
+    the last axis of the spatially transposed input. Each branch applies its
+    own convolution and batch norm in the frequency domain, and the two
+    results are concatenated along the channel axis.
+
+    :param in_channels: number of input channels
+    :param out_channels: total number of output channels; the row branch
+        produces ``out_channels // 2`` and the column branch the remainder
+    :param groups: ``groups`` argument of both convolutions
+    :param kernel_size: extent of the convolution kernel along the height axis
+    """
+    def __init__(self, in_channels, out_channels, groups=1, kernel_size=3):
+        # bn_layer not used
+        super(SeparableFourierUnit, self).__init__()
+        self.groups = groups
+        row_out_channels = out_channels // 2
+        col_out_channels = out_channels - row_out_channels
+        # channel counts are doubled because real and imaginary parts are stored as separate channels
+        self.row_conv = torch.nn.Conv2d(in_channels=in_channels * 2,
+                                        out_channels=row_out_channels * 2,
+                                        kernel_size=(kernel_size, 1),  # kernel size is always like this, but the data will be transposed
+                                        stride=1, padding=(kernel_size // 2, 0),
+                                        padding_mode='reflect',
+                                        groups=self.groups, bias=False)
+        self.col_conv = torch.nn.Conv2d(in_channels=in_channels * 2,
+                                        out_channels=col_out_channels * 2,
+                                        kernel_size=(kernel_size, 1),  # kernel size is always like this, but the data will be transposed
+                                        stride=1, padding=(kernel_size // 2, 0),
+                                        padding_mode='reflect',
+                                        groups=self.groups, bias=False)
+        self.row_bn = torch.nn.BatchNorm2d(row_out_channels * 2)
+        self.col_bn = torch.nn.BatchNorm2d(col_out_channels * 2)
+        self.relu = torch.nn.ReLU(inplace=True)
+    def process_branch(self, x, conv, bn):
+        """Run one branch: rfft along the last axis, conv/bn/relu, inverse rfft.
+
+        :param x: 4-D input tensor (the permute below indexes five axes after stacking)
+        :param conv: convolution applied to the stacked real/imaginary channels
+        :param bn: normalization applied after ``conv``
+        :return: real tensor whose last axis has the same length as that of ``x``
+        """
+        batch = x.shape[0]
+        # 1-D real FFT along the last axis
+        ffted = torch.fft.rfft(x, norm="ortho")
+        ffted = torch.stack((ffted.real, ffted.imag), dim=-1)
+        ffted = ffted.permute(0, 1, 4, 2, 3).contiguous()  # (batch, c, 2, h, w/2+1)
+        ffted = ffted.view((batch, -1,) + ffted.size()[3:])
+        ffted = self.relu(bn(conv(ffted)))
+        ffted = ffted.view((batch, -1, 2,) + ffted.size()[2:]).permute(
+            0, 1, 3, 4, 2).contiguous()  # (batch, c, h, w/2+1, 2)
+        ffted = torch.complex(ffted[..., 0], ffted[..., 1])
+        # The 1-D inverse takes the output length as the int ``n``; the ``s``
+        # keyword exists only on the n-dimensional variants and raises here.
+        output = torch.fft.irfft(ffted, n=x.shape[-1], norm="ortho")
+        return output
+    def forward(self, x):
+        """Return the channel-wise concatenation of the row and column branches."""
+        rowwise = self.process_branch(x, self.row_conv, self.row_bn)
+        # the column branch sees the transposed input and is transposed back afterwards
+        colwise = self.process_branch(x.permute(0, 1, 3, 2), self.col_conv, self.col_bn).permute(0, 1, 3, 2)
+        out = torch.cat((rowwise, colwise), dim=1)
+        return out
+class SpectralTransform(nn.Module):
+    """1x1 conv -> Fourier unit (plus optional local Fourier unit) -> 1x1 conv.
+
+    :param in_channels: number of input channels
+    :param out_channels: number of output channels; the Fourier units work on
+        ``out_channels // 2`` channels
+    :param stride: 2 inserts a 2x2 average pooling, any other value no downsampling
+    :param groups: ``groups`` argument of the 1x1 convolutions and Fourier units
+    :param enable_lfu: whether to add the local Fourier unit ``lfu``
+    :param separable_fu: use ``SeparableFourierUnit`` instead of ``FourierUnit``
+    :param fu_kwargs: extra keyword arguments for ``fu`` (not forwarded to ``lfu``)
+    """
+    def __init__(self, in_channels, out_channels, stride=1, groups=1, enable_lfu=True, separable_fu=False, **fu_kwargs):
+        # bn_layer not used
+        super(SpectralTransform, self).__init__()
+        self.enable_lfu = enable_lfu
+        # downsampling is done by average pooling rather than by a strided convolution
+        if stride == 2:
+            self.downsample = nn.AvgPool2d(kernel_size=(2, 2), stride=2)
+        else:
+            self.downsample = nn.Identity()
+        self.stride = stride
+        self.conv1 = nn.Sequential(
+            nn.Conv2d(in_channels, out_channels //
+                      2, kernel_size=1, groups=groups, bias=False),
+            nn.BatchNorm2d(out_channels // 2),
+            nn.ReLU(inplace=True)
+        )
+        fu_class = SeparableFourierUnit if separable_fu else FourierUnit
+        self.fu = fu_class(
+            out_channels // 2, out_channels // 2, groups, **fu_kwargs)
+        if self.enable_lfu:
+            self.lfu = fu_class(
+                out_channels // 2, out_channels // 2, groups)
+        self.conv2 = torch.nn.Conv2d(
+            out_channels // 2, out_channels, kernel_size=1, groups=groups, bias=False)
+    def forward(self, x):
+        """Return ``conv2(x + fu(x) + lfu_term)`` computed on the reduced input."""
+        x = self.downsample(x)
+        x = self.conv1(x)
+        output = self.fu(x)
+        if self.enable_lfu:
+            n, c, h, w = x.shape
+            split_no = 2
+            split_s = h // split_no
+            # take the first quarter of the channels and stack the height chunks on the channel axis
+            xs = torch.cat(torch.split(
+                x[:, :c // 4], split_s, dim=-2), dim=1).contiguous()
+            # NOTE(review): the width split reuses split_s derived from h --
+            # looks like it assumes square feature maps; confirm
+            xs = torch.cat(torch.split(xs, split_s, dim=-1),
+                           dim=1).contiguous()
+            xs = self.lfu(xs)
+            # tile the result split_no times along height and width
+            xs = xs.repeat(1, 1, split_no, split_no).contiguous()
+        else:
+            xs = 0
+        output = self.conv2(x + output + xs)
+        return output
+class FFC(nn.Module):
+    """Convolution whose input and output channels are split into a local and a global part.
+
+    ``int(in_channels * ratio_gin)`` input channels and
+    ``int(out_channels * ratio_gout)`` output channels are global, the rest
+    local. Four paths connect the parts: ``convl2l``, ``convl2g`` and
+    ``convg2l`` are ``nn.Conv2d``, ``convg2g`` is a ``SpectralTransform``.
+    A path whose input or output channel count is zero is ``nn.Identity``.
+
+    :param gated: if True (and both the global input and the local output are
+        non-empty) a 1x1 convolution ``gate`` yields two sigmoid gates that
+        scale the global-to-local and local-to-global paths
+    :param spectral_kwargs: extra keyword arguments for ``SpectralTransform``
+    """
+    def __init__(self, in_channels, out_channels, kernel_size,
+                 ratio_gin, ratio_gout, stride=1, padding=0,
+                 dilation=1, groups=1, bias=False, enable_lfu=True,
+                 padding_type='reflect', gated=False, **spectral_kwargs):
+        super(FFC, self).__init__()
+        assert stride == 1 or stride == 2, "Stride should be 1 or 2."
+        self.stride = stride
+        # channel bookkeeping: *_cg = global channels, *_cl = local channels
+        in_cg = int(in_channels * ratio_gin)
+        in_cl = in_channels - in_cg
+        out_cg = int(out_channels * ratio_gout)
+        out_cl = out_channels - out_cg
+        #groups_g = 1 if groups == 1 else int(groups * ratio_gout)
+        #groups_l = 1 if groups == 1 else groups - groups_g
+        self.ratio_gin = ratio_gin
+        self.ratio_gout = ratio_gout
+        self.global_in_num = in_cg
+        # every path degenerates to nn.Identity when one of its ends has no channels
+        module = nn.Identity if in_cl == 0 or out_cl == 0 else nn.Conv2d
+        self.convl2l = module(in_cl, out_cl, kernel_size,
+                              stride, padding, dilation, groups, bias, padding_mode=padding_type)
+        module = nn.Identity if in_cl == 0 or out_cg == 0 else nn.Conv2d
+        self.convl2g = module(in_cl, out_cg, kernel_size,
+                              stride, padding, dilation, groups, bias, padding_mode=padding_type)
+        module = nn.Identity if in_cg == 0 or out_cl == 0 else nn.Conv2d
+        self.convg2l = module(in_cg, out_cl, kernel_size,
+                              stride, padding, dilation, groups, bias, padding_mode=padding_type)
+        module = nn.Identity if in_cg == 0 or out_cg == 0 else SpectralTransform
+        self.convg2g = module(
+            in_cg, out_cg, stride, 1 if groups == 1 else groups // 2, enable_lfu, **spectral_kwargs)
+        self.gated = gated
+        module = nn.Identity if in_cg == 0 or out_cl == 0 or not self.gated else nn.Conv2d
+        self.gate = module(in_channels, 2, 1)
+    def forward(self, x):
+        """Return ``(out_local, out_global)`` for ``x``.
+
+        :param x: either a ``(local, global)`` tuple or a single tensor, which
+            is treated as the local part with the global part set to ``0``
+        :return: tuple of outputs; the local one stays ``0`` when
+            ``ratio_gout == 1`` and the global one stays ``0`` when
+            ``ratio_gout == 0``
+        """
+        x_l, x_g = x if type(x) is tuple else (x, 0)
+        out_xl, out_xg = 0, 0
+        if self.gated:
+            # gates are computed from the concatenation of all tensor inputs
+            total_input_parts = [x_l]
+            if torch.is_tensor(x_g):
+                total_input_parts.append(x_g)
+            total_input = torch.cat(total_input_parts, dim=1)
+            gates = torch.sigmoid(self.gate(total_input))
+            g2l_gate, l2g_gate = gates.chunk(2, dim=1)
+        else:
+            g2l_gate, l2g_gate = 1, 1
+        if self.ratio_gout != 1:
+            out_xl = self.convl2l(x_l) + self.convg2l(x_g) * g2l_gate
+        if self.ratio_gout != 0:
+            out_xg = self.convl2g(x_l) * l2g_gate + self.convg2g(x_g)
+        return out_xl, out_xg
+class FFC_BN_ACT(nn.Module):
+    """``FFC`` followed by separate normalization and activation for each output part.
+
+    The local branch uses ``nn.Identity`` for norm and activation when
+    ``ratio_gout == 1``; the global branch does so when ``ratio_gout == 0``.
+
+    :param norm_layer: constructor called with the channel count of each part
+    :param activation_layer: constructor called as ``activation_layer(inplace=True)``
+    :param kwargs: extra keyword arguments forwarded to ``FFC``
+    """
+    def __init__(self, in_channels, out_channels,
+                 kernel_size, ratio_gin, ratio_gout,
+                 stride=1, padding=0, dilation=1, groups=1, bias=False,
+                 norm_layer=nn.BatchNorm2d, activation_layer=nn.Identity,
+                 padding_type='reflect',
+                 enable_lfu=True, **kwargs):
+        super(FFC_BN_ACT, self).__init__()
+        self.ffc = FFC(in_channels, out_channels, kernel_size,
+                       ratio_gin, ratio_gout, stride, padding, dilation,
+                       groups, bias, enable_lfu, padding_type=padding_type, **kwargs)
+        lnorm = nn.Identity if ratio_gout == 1 else norm_layer
+        gnorm = nn.Identity if ratio_gout == 0 else norm_layer
+        # same split as in FFC: the global part gets int(out_channels * ratio_gout) channels
+        global_channels = int(out_channels * ratio_gout)
+        self.bn_l = lnorm(out_channels - global_channels)
+        self.bn_g = gnorm(global_channels)
+        lact = nn.Identity if ratio_gout == 1 else activation_layer
+        gact = nn.Identity if ratio_gout == 0 else activation_layer
+        self.act_l = lact(inplace=True)
+        self.act_g = gact(inplace=True)
+    def forward(self, x):
+        """Apply the FFC, then norm and activation to each part; return ``(x_l, x_g)``."""
+        x_l, x_g = self.ffc(x)
+        x_l = self.act_l(self.bn_l(x_l))
+        x_g = self.act_g(self.bn_g(x_g))
+        return x_l, x_g
+class FFCResnetBlock(nn.Module):
+    """Residual block made of two ``FFC_BN_ACT`` layers with a skip connection per part.
+
+    :param dim: channel count of the block input and output
+    :param dilation: used as both dilation and padding of the 3x3 convolutions
+    :param spatial_transform_kwargs: if not None, both layers are wrapped in
+        ``LearnableSpatialTransformWrapper`` with these keyword arguments
+    :param inline: if True the block takes and returns a single tensor whose
+        trailing channels are the global part
+    :param conv_kwargs: extra keyword arguments for both ``FFC_BN_ACT`` layers
+    """
+    def __init__(self, dim, padding_type, norm_layer, activation_layer=nn.ReLU, dilation=1,
+                 spatial_transform_kwargs=None, inline=False, **conv_kwargs):
+        super().__init__()
+        self.conv1 = FFC_BN_ACT(dim, dim, kernel_size=3, padding=dilation, dilation=dilation,
+                                norm_layer=norm_layer,
+                                activation_layer=activation_layer,
+                                padding_type=padding_type,
+                                **conv_kwargs)
+        self.conv2 = FFC_BN_ACT(dim, dim, kernel_size=3, padding=dilation, dilation=dilation,
+                                norm_layer=norm_layer,
+                                activation_layer=activation_layer,
+                                padding_type=padding_type,
+                                **conv_kwargs)
+        if spatial_transform_kwargs is not None:
+            self.conv1 = LearnableSpatialTransformWrapper(self.conv1, **spatial_transform_kwargs)
+            self.conv2 = LearnableSpatialTransformWrapper(self.conv2, **spatial_transform_kwargs)
+        self.inline = inline
+    def forward(self, x):
+        """Return ``x`` plus the output of the two FFC layers, part by part."""
+        if self.inline:
+            # NOTE(review): with global_in_num == 0 the slice ``:-0`` is empty
+            # and ``-0:`` selects every channel -- confirm inline mode is never
+            # used with ratio_gin == 0
+            x_l, x_g = x[:, :-self.conv1.ffc.global_in_num], x[:, -self.conv1.ffc.global_in_num:]
+        else:
+            x_l, x_g = x if type(x) is tuple else (x, 0)
+        # keep the inputs for the residual connection
+        id_l, id_g = x_l, x_g
+        x_l, x_g = self.conv1((x_l, x_g))
+        x_l, x_g = self.conv2((x_l, x_g))
+        x_l, x_g = id_l + x_l, id_g + x_g
+        out = x_l, x_g
+        if self.inline:
+            out = torch.cat(out, dim=1)
+        return out
+class ConcatTupleLayer(nn.Module):
+    """Merge a ``(local, global)`` tuple into a single tensor along the channel axis."""
+    def forward(self, x):
+        """Return the concatenated pair, or the local part alone when the global part is not a tensor."""
+        assert isinstance(x, tuple)
+        local_part, global_part = x
+        has_global = torch.is_tensor(global_part)
+        assert torch.is_tensor(local_part) or has_global
+        return torch.cat(x, dim=1) if has_global else local_part
+class FFCResNetGenerator(nn.Module):
+    """Generator built as one ``nn.Sequential``: FFC stem, FFC downsampling, FFC resnet blocks, transposed-conv upsampling, output conv.
+
+    :param input_nc: number of input channels
+    :param output_nc: number of output channels
+    :param ngf: channel count after the stem
+    :param n_downsampling: number of stride-2 FFC layers and of upsampling stages
+    :param n_blocks: number of ``FFCResnetBlock`` layers at the bottleneck
+    :param init_conv_kwargs: keyword arguments for the stem ``FFC_BN_ACT``
+    :param downsample_conv_kwargs: keyword arguments for the downsampling layers
+    :param resnet_conv_kwargs: keyword arguments for the resnet blocks
+    :param spatial_transform_layers: indices of resnet blocks to wrap in
+        ``LearnableSpatialTransformWrapper`` (None disables wrapping)
+    :param add_out_act: True appends ``get_activation('tanh')``; any other
+        truthy value is passed to ``get_activation`` as is
+    :param max_features: upper bound applied to every channel count
+    :param out_ffc: if True an inline ``FFCResnetBlock`` is added before the output conv
+    """
+    def __init__(self, input_nc, output_nc, ngf=64, n_downsampling=3, n_blocks=9, norm_layer=nn.BatchNorm2d,
+                 padding_type='reflect', activation_layer=nn.ReLU,
+                 up_norm_layer=nn.BatchNorm2d, up_activation=nn.ReLU(True),
+                 init_conv_kwargs={}, downsample_conv_kwargs={}, resnet_conv_kwargs={},
+                 spatial_transform_layers=None, spatial_transform_kwargs={},
+                 add_out_act=True, max_features=1024, out_ffc=False, out_ffc_kwargs={}):
+        assert (n_blocks >= 0)
+        super().__init__()
+        model = [nn.ReflectionPad2d(3),
+                 FFC_BN_ACT(input_nc, ngf, kernel_size=7, padding=0, norm_layer=norm_layer,
+                            activation_layer=activation_layer, **init_conv_kwargs)]
+        ### downsample
+        for i in range(n_downsampling):
+            mult = 2 ** i
+            if i == n_downsampling - 1:
+                # the last downsampling layer takes its ratio_gout from the
+                # resnet blocks' ratio_gin; work on a copy so the caller's
+                # dict is not modified
+                cur_conv_kwargs = dict(downsample_conv_kwargs)
+                cur_conv_kwargs['ratio_gout'] = resnet_conv_kwargs.get('ratio_gin', 0)
+            else:
+                cur_conv_kwargs = downsample_conv_kwargs
+            model += [FFC_BN_ACT(min(max_features, ngf * mult),
+                                 min(max_features, ngf * mult * 2),
+                                 kernel_size=3, stride=2, padding=1,
+                                 norm_layer=norm_layer,
+                                 activation_layer=activation_layer,
+                                 **cur_conv_kwargs)]
+        mult = 2 ** n_downsampling
+        feats_num_bottleneck = min(max_features, ngf * mult)
+        ### resnet blocks
+        for i in range(n_blocks):
+            cur_resblock = FFCResnetBlock(feats_num_bottleneck, padding_type=padding_type, activation_layer=activation_layer,
+                                          norm_layer=norm_layer, **resnet_conv_kwargs)
+            if spatial_transform_layers is not None and i in spatial_transform_layers:
+                cur_resblock = LearnableSpatialTransformWrapper(cur_resblock, **spatial_transform_kwargs)
+            model += [cur_resblock]
+        # from here on the layers work on a single tensor instead of a (local, global) tuple
+        model += [ConcatTupleLayer()]
+        ### upsample
+        for i in range(n_downsampling):
+            mult = 2 ** (n_downsampling - i)
+            model += [nn.ConvTranspose2d(min(max_features, ngf * mult),
+                                         min(max_features, int(ngf * mult / 2)),
+                                         kernel_size=3, stride=2, padding=1, output_padding=1),
+                      up_norm_layer(min(max_features, int(ngf * mult / 2))),
+                      up_activation]
+        if out_ffc:
+            model += [FFCResnetBlock(ngf, padding_type=padding_type, activation_layer=activation_layer,
+                                     norm_layer=norm_layer, inline=True, **out_ffc_kwargs)]
+        model += [nn.ReflectionPad2d(3),
+                  nn.Conv2d(ngf, output_nc, kernel_size=7, padding=0)]
+        if add_out_act:
+            model.append(get_activation('tanh' if add_out_act is True else add_out_act))
+        self.model = nn.Sequential(*model)
+    def forward(self, input):
+        """Run ``input`` through the sequential model."""
+        return self.model(input)
+class FFCNLayerDiscriminator(BaseDiscriminator):
+    """Discriminator built from ``FFC_BN_ACT`` stages stored as ``model0`` .. ``model{n_layers + 1}``.
+
+    :param input_nc: number of input channels
+    :param ndf: channel count of the first stage
+    :param n_layers: number of FFC stages before the final stride-1 stage
+    :param max_features: upper bound for the channel count of the stride-2 stages
+    :param init_conv_kwargs: keyword arguments for the first ``FFC_BN_ACT``
+    :param conv_kwargs: keyword arguments for all later ``FFC_BN_ACT`` layers
+    """
+    def __init__(self, input_nc, ndf=64, n_layers=3, norm_layer=nn.BatchNorm2d, max_features=512,
+                 init_conv_kwargs={}, conv_kwargs={}):
+        super().__init__()
+        self.n_layers = n_layers
+        # FFC_BN_ACT calls the activation constructor with inplace=True
+        def _act_ctor(inplace=True):
+            return nn.LeakyReLU(negative_slope=0.2, inplace=inplace)
+        kw = 3
+        padw = int(np.ceil((kw-1.0)/2))
+        sequence = [[FFC_BN_ACT(input_nc, ndf, kernel_size=kw, padding=padw, norm_layer=norm_layer,
+                                activation_layer=_act_ctor, **init_conv_kwargs)]]
+        nf = ndf
+        for n in range(1, n_layers):
+            nf_prev = nf
+            nf = min(nf * 2, max_features)
+            cur_model = [
+                FFC_BN_ACT(nf_prev, nf,
+                           kernel_size=kw, stride=2, padding=padw,
+                           norm_layer=norm_layer,
+                           activation_layer=_act_ctor,
+                           **conv_kwargs)
+            ]
+            sequence.append(cur_model)
+        nf_prev = nf
+        # NOTE(review): this cap is the literal 512 rather than max_features,
+        # unlike the loop above -- confirm whether that is intended
+        nf = min(nf * 2, 512)
+        cur_model = [
+            FFC_BN_ACT(nf_prev, nf,
+                       kernel_size=kw, stride=1, padding=padw,
+                       norm_layer=norm_layer,
+                       activation_layer=lambda *args, **kwargs: nn.LeakyReLU(*args, negative_slope=0.2, **kwargs),
+                       **conv_kwargs),
+            ConcatTupleLayer()
+        ]
+        sequence.append(cur_model)
+        # final 1-channel prediction layer
+        sequence += [[nn.Conv2d(nf, 1, kernel_size=kw, stride=1, padding=padw)]]
+        # register every stage as its own attribute: model0, model1, ...
+        for n in range(len(sequence)):
+            setattr(self, 'model'+str(n), nn.Sequential(*sequence[n]))
+    def get_all_activations(self, x):
+        """Return the output of every stage, each stage being fed the previous stage's output."""
+        res = [x]
+        for n in range(self.n_layers + 2):
+            model = getattr(self, 'model' + str(n))
+            res.append(model(res[-1]))
+        return res[1:]
+    def forward(self, x):
+        """Return ``(prediction, features)``.
+
+        ``prediction`` is the output of the last stage; ``features`` holds the
+        outputs of all earlier stages, with ``(local, global)`` tuples
+        concatenated along the channel axis (or reduced to the local part when
+        the global part is not a tensor).
+        """
+        act = self.get_all_activations(x)
+        feats = []
+        for out in act[:-1]:
+            if isinstance(out, tuple):
+                if torch.is_tensor(out[1]):
+                    out = torch.cat(out, dim=1)
+                else:
+                    out = out[0]
+            feats.append(out)
+        return act[-1], feats

annotator/lama/saicinpainting/training/modules/multidilated_conv.py ADDED Viewed

	@@ -0,0 +1,98 @@

+import torch
+import torch.nn as nn
+import random
+from annotator.lama.saicinpainting.training.modules.depthwise_sep_conv import DepthWiseSeperableConv
+class MultidilatedConv(nn.Module):
+    """Bank of ``dilation_num`` convolutions whose dilation doubles from one to the next.
+
+    :param comb_mode: how inputs/outputs are combined:
+        ``'sum'`` -- every conv sees the full input, outputs are summed;
+        ``'cat_out'`` -- every conv sees the full input, outputs are concatenated;
+        ``'cat_in'`` -- the input channels are split between the convs, outputs are summed;
+        ``'cat_both'`` -- input is split and outputs are concatenated
+    :param equal_dim: split channels evenly if True, otherwise in halving
+        portions with the last conv taking the remainder
+    :param shared_weights: reuse the first conv's ``weight`` and ``bias`` in all convs
+    :param padding: int (multiplied by each conv's dilation) or a per-conv sequence
+    :param min_dilation: dilation of the first conv
+    :param shuffle_in_channels: apply a fixed random permutation to the input channels
+    :param use_depthwise: use ``DepthWiseSeperableConv`` instead of ``nn.Conv2d``
+    :param kwargs: extra keyword arguments for every conv
+    """
+    def __init__(self, in_dim, out_dim, kernel_size, dilation_num=3, comb_mode='sum', equal_dim=True,
+                 shared_weights=False, padding=1, min_dilation=1, shuffle_in_channels=False, use_depthwise=False, **kwargs):
+        super().__init__()
+        convs = []
+        self.equal_dim = equal_dim
+        assert comb_mode in ('cat_out', 'sum', 'cat_in', 'cat_both'), comb_mode
+        if comb_mode in ('cat_out', 'cat_both'):
+            self.cat_out = True
+            if equal_dim:
+                assert out_dim % dilation_num == 0
+                out_dims = [out_dim // dilation_num] * dilation_num
+                # index interleaves the concatenated outputs: channel i of conv 0, of conv 1, ...
+                self.index = sum([[i + j * (out_dims[0]) for j in range(dilation_num)] for i in range(out_dims[0])], [])
+            else:
+                out_dims = [out_dim // 2 ** (i + 1) for i in range(dilation_num - 1)]
+                out_dims.append(out_dim - sum(out_dims))
+                index = []
+                # NOTE(review): starts holds the preceding conv's size, not a
+                # cumulative offset -- confirm this is the intended channel order
+                starts = [0] + out_dims[:-1]
+                lengths = [out_dims[i] // out_dims[-1] for i in range(dilation_num)]
+                for i in range(out_dims[-1]):
+                    for j in range(dilation_num):
+                        index += list(range(starts[j], starts[j] + lengths[j]))
+                        starts[j] += lengths[j]
+                self.index = index
+                assert(len(index) == out_dim)
+            self.out_dims = out_dims
+        else:
+            self.cat_out = False
+            self.out_dims = [out_dim] * dilation_num
+        if comb_mode in ('cat_in', 'cat_both'):
+            if equal_dim:
+                assert in_dim % dilation_num == 0
+                in_dims = [in_dim // dilation_num] * dilation_num
+            else:
+                in_dims = [in_dim // 2 ** (i + 1) for i in range(dilation_num - 1)]
+                in_dims.append(in_dim - sum(in_dims))
+            self.in_dims = in_dims
+            self.cat_in = True
+        else:
+            self.cat_in = False
+            self.in_dims = [in_dim] * dilation_num
+        conv_type = DepthWiseSeperableConv if use_depthwise else nn.Conv2d
+        dilation = min_dilation
+        for i in range(dilation_num):
+            # an int padding is scaled with the dilation; a sequence gives one padding per conv
+            if isinstance(padding, int):
+                cur_padding = padding * dilation
+            else:
+                cur_padding = padding[i]
+            convs.append(conv_type(
+                self.in_dims[i], self.out_dims[i], kernel_size, padding=cur_padding, dilation=dilation, **kwargs
+            ))
+            if i > 0 and shared_weights:
+                # NOTE(review): relies on the conv type exposing .weight / .bias
+                # -- verify for DepthWiseSeperableConv
+                convs[-1].weight = convs[0].weight
+                convs[-1].bias = convs[0].bias
+            dilation *= 2
+        self.convs = nn.ModuleList(convs)
+        self.shuffle_in_channels = shuffle_in_channels
+        if self.shuffle_in_channels:
+            # shuffle list as shuffling of tensors is nondeterministic
+            in_channels_permute = list(range(in_dim))
+            random.shuffle(in_channels_permute)
+            # save as buffer so it is saved and loaded with checkpoint
+            self.register_buffer('in_channels_permute', torch.tensor(in_channels_permute))
+    def forward(self, x):
+        """Apply all convs to ``x`` (or to its channel chunks) and combine their outputs."""
+        if self.shuffle_in_channels:
+            x = x[:, self.in_channels_permute]
+        outs = []
+        if self.cat_in:
+            # split the input channels between the convs
+            if self.equal_dim:
+                x = x.chunk(len(self.convs), dim=1)
+            else:
+                new_x = []
+                start = 0
+                for dim in self.in_dims:
+                    new_x.append(x[:, start:start+dim])
+                    start += dim
+                x = new_x
+        for i, conv in enumerate(self.convs):
+            if self.cat_in:
+                input = x[i]
+            else:
+                input = x
+            outs.append(conv(input))
+        if self.cat_out:
+            # concatenate, then reorder the channels with the precomputed index
+            out = torch.cat(outs, dim=1)[:, self.index]
+        else:
+            out = sum(outs)
+        return out

annotator/lama/saicinpainting/training/modules/multiscale.py ADDED Viewed

	@@ -0,0 +1,244 @@

+from typing import List, Tuple, Union, Optional
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from annotator.lama.saicinpainting.training.modules.base import get_conv_block_ctor, get_activation
+from annotator.lama.saicinpainting.training.modules.pix2pixhd import ResnetBlock
+class ResNetHead(nn.Module):
+    """First half of a ResNet generator: 7x7 stem, stride-2 downsampling convs, residual blocks."""
+    def __init__(self, input_nc, ngf=64, n_downsampling=3, n_blocks=9, norm_layer=nn.BatchNorm2d,
+                 padding_type='reflect', conv_kind='default', activation=nn.ReLU(True)):
+        assert (n_blocks >= 0)
+        super(ResNetHead, self).__init__()
+        make_conv = get_conv_block_ctor(conv_kind)
+        # stem: 7x7 convolution on a reflection-padded input
+        layers = [
+            nn.ReflectionPad2d(3),
+            make_conv(input_nc, ngf, kernel_size=7, padding=0),
+            norm_layer(ngf),
+            activation,
+        ]
+        # every downsampling stage is a stride-2 conv that doubles the channel count
+        for level in range(n_downsampling):
+            in_ch = ngf * 2 ** level
+            out_ch = in_ch * 2
+            layers.extend([
+                make_conv(in_ch, out_ch, kernel_size=3, stride=2, padding=1),
+                norm_layer(out_ch),
+                activation,
+            ])
+        # residual blocks at the lowest resolution
+        bottleneck_ch = ngf * 2 ** n_downsampling
+        layers.extend(
+            ResnetBlock(bottleneck_ch, padding_type=padding_type, activation=activation,
+                        norm_layer=norm_layer, conv_kind=conv_kind)
+            for _ in range(n_blocks)
+        )
+        self.model = nn.Sequential(*layers)
+    def forward(self, input):
+        """Return the features produced by the sequential model."""
+        return self.model(input)
+class ResNetTail(nn.Module):
+    """Second half of a ResNet generator: residual blocks, transposed-conv upsampling, output projection.
+
+    :param output_nc: number of output channels
+    :param ngf: channel count after the last upsampling stage
+    :param n_downsampling: number of upsampling stages (the input is expected
+        to have ``ngf * 2 ** n_downsampling`` channels unless ``add_in_proj`` is set)
+    :param out_extra_layers_n: number of extra 1x1 conv/norm/activation groups
+        before the output conv
+    :param add_out_act: True appends ``get_activation('tanh')``; any other
+        truthy value is passed to ``get_activation`` as is
+    :param add_in_proj: if not None, the number of input channels; a 1x1 conv
+        then maps them to ``ngf * 2 ** n_downsampling`` first
+    """
+    def __init__(self, output_nc, ngf=64, n_downsampling=3, n_blocks=9, norm_layer=nn.BatchNorm2d,
+                 padding_type='reflect', conv_kind='default', activation=nn.ReLU(True),
+                 up_norm_layer=nn.BatchNorm2d, up_activation=nn.ReLU(True), add_out_act=False, out_extra_layers_n=0,
+                 add_in_proj=None):
+        assert (n_blocks >= 0)
+        super(ResNetTail, self).__init__()
+        mult = 2 ** n_downsampling
+        model = []
+        if add_in_proj is not None:
+            model.append(nn.Conv2d(add_in_proj, ngf * mult, kernel_size=1))
+        ### resnet blocks
+        for i in range(n_blocks):
+            model += [ResnetBlock(ngf * mult, padding_type=padding_type, activation=activation, norm_layer=norm_layer,
+                                  conv_kind=conv_kind)]
+        ### upsample
+        for i in range(n_downsampling):
+            mult = 2 ** (n_downsampling - i)
+            model += [nn.ConvTranspose2d(ngf * mult, int(ngf * mult / 2), kernel_size=3, stride=2, padding=1,
+                                         output_padding=1),
+                      up_norm_layer(int(ngf * mult / 2)),
+                      up_activation]
+        self.model = nn.Sequential(*model)
+        # the output projection is kept separate so forward can also return the features feeding it
+        out_layers = []
+        for _ in range(out_extra_layers_n):
+            out_layers += [nn.Conv2d(ngf, ngf, kernel_size=1, padding=0),
+                           up_norm_layer(ngf),
+                           up_activation]
+        out_layers += [nn.ReflectionPad2d(3),
+                       nn.Conv2d(ngf, output_nc, kernel_size=7, padding=0)]
+        if add_out_act:
+            out_layers.append(get_activation('tanh' if add_out_act is True else add_out_act))
+        self.out_proj = nn.Sequential(*out_layers)
+    def forward(self, input, return_last_act=False):
+        """Return the projected output, plus the pre-projection features if ``return_last_act`` is True."""
+        features = self.model(input)
+        out = self.out_proj(features)
+        if return_last_act:
+            return out, features
+        else:
+            return out
+class MultiscaleResNet(nn.Module):
+    """Generator with one ``ResNetHead`` / ``ResNetTail`` pair per scale.
+
+    Scales are processed from the last (lowest-resolution) input to the
+    first; each tail except the first one processed also receives the
+    previous tail's features, concatenated to its head's features.
+
+    :param n_scales: number of head/tail pairs
+    :param out_cumulative: if True each output is added to the interpolated
+        output of the previous (coarser) scale
+    :param return_only_hr: if True ``forward`` returns only the output of the
+        first (highest-resolution) scale
+    """
+    def __init__(self, input_nc, output_nc, ngf=64, n_downsampling=2, n_blocks_head=2, n_blocks_tail=6, n_scales=3,
+                 norm_layer=nn.BatchNorm2d, padding_type='reflect', conv_kind='default', activation=nn.ReLU(True),
+                 up_norm_layer=nn.BatchNorm2d, up_activation=nn.ReLU(True), add_out_act=False, out_extra_layers_n=0,
+                 out_cumulative=False, return_only_hr=False):
+        super().__init__()
+        self.heads = nn.ModuleList([ResNetHead(input_nc, ngf=ngf, n_downsampling=n_downsampling,
+                                               n_blocks=n_blocks_head, norm_layer=norm_layer, padding_type=padding_type,
+                                               conv_kind=conv_kind, activation=activation)
+                                    for i in range(n_scales)])
+        # channels of head features plus the ngf channels of the previous tail's features
+        tail_in_feats = ngf * (2 ** n_downsampling) + ngf
+        # the last tail is processed first and has no previous tail, so it gets no input projection
+        self.tails = nn.ModuleList([ResNetTail(output_nc,
+                                               ngf=ngf, n_downsampling=n_downsampling,
+                                               n_blocks=n_blocks_tail, norm_layer=norm_layer, padding_type=padding_type,
+                                               conv_kind=conv_kind, activation=activation, up_norm_layer=up_norm_layer,
+                                               up_activation=up_activation, add_out_act=add_out_act,
+                                               out_extra_layers_n=out_extra_layers_n,
+                                               add_in_proj=None if (i == n_scales - 1) else tail_in_feats)
+                                    for i in range(n_scales)])
+        self.out_cumulative = out_cumulative
+        self.return_only_hr = return_only_hr
+    @property
+    def num_scales(self):
+        return len(self.heads)
+    def forward(self, ms_inputs: List[torch.Tensor], smallest_scales_num: Optional[int] = None) \
+        -> Union[torch.Tensor, List[torch.Tensor]]:
+        """
+        :param ms_inputs: List of inputs of different resolutions from HR to LR
+        :param smallest_scales_num: int or None, number of smallest scales to take at input
+        :return: Depending on return_only_hr:
+            True: Only the most HR output
+            False: List of outputs of different resolutions from HR to LR
+        """
+        if smallest_scales_num is None:
+            assert len(self.heads) == len(ms_inputs), (len(self.heads), len(ms_inputs), smallest_scales_num)
+            smallest_scales_num = len(self.heads)
+        else:
+            assert smallest_scales_num == len(ms_inputs) <= len(self.heads), (len(self.heads), len(ms_inputs), smallest_scales_num)
+        # only the heads of the requested smallest scales are used
+        cur_heads = self.heads[-smallest_scales_num:]
+        ms_features = [cur_head(cur_inp) for cur_head, cur_inp in zip(cur_heads, ms_inputs)]
+        all_outputs = []
+        prev_tail_features = None
+        # walk the scales backwards, i.e. from LR to HR
+        for i in range(len(ms_features)):
+            scale_i = -i - 1
+            cur_tail_input = ms_features[-i - 1]
+            if prev_tail_features is not None:
+                # resize the previous tail's features to the current head's features before concatenating
+                if prev_tail_features.shape != cur_tail_input.shape:
+                    prev_tail_features = F.interpolate(prev_tail_features, size=cur_tail_input.shape[2:],
+                                                       mode='bilinear', align_corners=False)
+                cur_tail_input = torch.cat((cur_tail_input, prev_tail_features), dim=1)
+            cur_out, cur_tail_feats = self.tails[scale_i](cur_tail_input, return_last_act=True)
+            prev_tail_features = cur_tail_feats
+            all_outputs.append(cur_out)
+        if self.out_cumulative:
+            # each output becomes itself plus the resized cumulative output of the previous scale
+            all_outputs_cum = [all_outputs[0]]
+            for i in range(1, len(ms_features)):
+                cur_out = all_outputs[i]
+                cur_out_cum = cur_out + F.interpolate(all_outputs_cum[-1], size=cur_out.shape[2:],
+                                                      mode='bilinear', align_corners=False)
+                all_outputs_cum.append(cur_out_cum)
+            all_outputs = all_outputs_cum
+        # all_outputs is ordered LR -> HR here
+        if self.return_only_hr:
+            return all_outputs[-1]
+        else:
+            return all_outputs[::-1]
+class MultiscaleDiscriminatorSimple(nn.Module):
+    """Container that applies one discriminator per input scale.
+
+    :param ms_impl: iterable of discriminator modules, ordered from HR to LR
+    """
+    def __init__(self, ms_impl):
+        super().__init__()
+        self.ms_impl = nn.ModuleList(ms_impl)
+    @property
+    def num_scales(self):
+        return len(self.ms_impl)
+    def forward(self, ms_inputs: List[torch.Tensor], smallest_scales_num: Optional[int] = None) \
+            -> List[Tuple[torch.Tensor, List[torch.Tensor]]]:
+        """
+        :param ms_inputs: List of inputs of different resolutions from HR to LR
+        :param smallest_scales_num: int or None, number of smallest scales to take at input
+        :return: List of pairs (prediction, features) for different resolutions from HR to LR
+        """
+        if smallest_scales_num is None:
+            assert len(self.ms_impl) == len(ms_inputs), (len(self.ms_impl), len(ms_inputs), smallest_scales_num)
+            # this class stores its sub-modules in ms_impl; it has no ``heads`` attribute
+            smallest_scales_num = len(self.ms_impl)
+        else:
+            assert smallest_scales_num == len(ms_inputs) <= len(self.ms_impl), \
+                (len(self.ms_impl), len(ms_inputs), smallest_scales_num)
+        # pair the discriminators of the smallest scales with the given inputs
+        return [cur_discr(cur_input) for cur_discr, cur_input in zip(self.ms_impl[-smallest_scales_num:], ms_inputs)]
+class SingleToMultiScaleInputMixin:
+    """Mixin that turns one input tensor into a list of progressively halved inputs."""
+    def forward(self, x: torch.Tensor) -> List:
+        """Build ``self.num_scales`` bilinear resizes of ``x`` (factors 1, 2, 4, ...) and delegate to the next ``forward``."""
+        height, width = x.shape[2:]
+        pyramid = []
+        for level in range(self.num_scales):
+            factor = 2 ** level
+            target_size = (height // factor, width // factor)
+            pyramid.append(F.interpolate(x, size=target_size, mode='bilinear', align_corners=False))
+        return super().forward(pyramid)
+class GeneratorMultiToSingleOutputMixin:
+    """Mixin that keeps only the first element of the wrapped ``forward`` result."""
+    def forward(self, x):
+        """Return element 0 of the next ``forward``'s output."""
+        outputs = super().forward(x)
+        first_output = outputs[0]
+        return first_output
+class DiscriminatorMultiToSingleOutputMixin:
+    """Mixin that reduces per-scale ``(prediction, features)`` pairs to one pair."""
+    def forward(self, x):
+        """Return the first scale's prediction and the features of all scales in one flat list."""
+        per_scale = super().forward(x)
+        first_prediction = per_scale[0][0]
+        flat_features = []
+        for _, scale_features in per_scale:
+            flat_features.extend(scale_features)
+        return first_prediction, flat_features
+class DiscriminatorMultiToSingleOutputStackedMixin:
+    """Mixin that stacks the per-scale predictions into one tensor along the channel axis."""
+    def __init__(self, *args, return_feats_only_levels=None, **kwargs):
+        """Store ``return_feats_only_levels``; all other arguments go to the next ``__init__``."""
+        super().__init__(*args, **kwargs)
+        self.return_feats_only_levels = return_feats_only_levels
+    def forward(self, x):
+        """Return ``(stacked_predictions, features)``.
+
+        Every prediction after the first is bilinearly resized to the first
+        one's spatial size before concatenation. Features come from the levels
+        listed in ``return_feats_only_levels``, or from all levels when it is None.
+        """
+        per_scale = super().forward(x)
+        predictions = [prediction for prediction, _ in per_scale]
+        reference = predictions[0]
+        resized = [reference]
+        for prediction in predictions[1:]:
+            resized.append(F.interpolate(prediction, size=reference.shape[-2:],
+                                         mode='bilinear', align_corners=False))
+        stacked = torch.cat(resized, dim=1)
+        levels = self.return_feats_only_levels
+        if levels is None:
+            selected = [scale_features for _, scale_features in per_scale]
+        else:
+            selected = [per_scale[level][1] for level in levels]
+        flat_features = [feature for scale_features in selected for feature in scale_features]
+        return stacked, flat_features
+class MultiscaleDiscrSingleInput(SingleToMultiScaleInputMixin, DiscriminatorMultiToSingleOutputStackedMixin, MultiscaleDiscriminatorSimple):
+    """Multiscale discriminator that takes a single tensor and returns one stacked prediction plus a flat feature list."""
+    pass
+class MultiscaleResNetSingle(GeneratorMultiToSingleOutputMixin, SingleToMultiScaleInputMixin, MultiscaleResNet):
+    """Multiscale generator that takes a single tensor and returns element 0 of ``MultiscaleResNet.forward``'s result."""
+    pass

annotator/lama/saicinpainting/training/modules/pix2pixhd.py ADDED Viewed

	@@ -0,0 +1,669 @@

+# original: https://github.com/NVIDIA/pix2pixHD/blob/master/models/networks.py
+import collections
+from functools import partial
+import functools
+import logging
+from collections import defaultdict
+import numpy as np
+import torch.nn as nn
+from annotator.lama.saicinpainting.training.modules.base import BaseDiscriminator, deconv_factory, get_conv_block_ctor, get_norm_layer, get_activation
+from annotator.lama.saicinpainting.training.modules.ffc import FFCResnetBlock
+from annotator.lama.saicinpainting.training.modules.multidilated_conv import MultidilatedConv
+class DotDict(defaultdict):
+    # https://stackoverflow.com/questions/2352181/how-to-use-a-dot-to-access-members-of-dictionary
+    """dot.notation access to dictionary attributes"""
+    # Attribute reads go through dict.get: a missing key yields None instead of
+    # raising AttributeError, and default_factory is not triggered.
+    __getattr__ = defaultdict.get
+    # Attribute writes and deletes are forwarded to item assignment / deletion.
+    __setattr__ = defaultdict.__setitem__
+    __delattr__ = defaultdict.__delitem__
+class Identity(nn.Module):
+    """Pass-through module: ``forward`` hands back its argument unchanged."""
+    def forward(self, x):
+        return x
+class ResnetBlock(nn.Module):
+    """Residual block built from two 3x3 convolutions.
+    The residual branch is: padding, conv, norm, activation, optional ``Dropout(0.5)``,
+    padding, conv, norm. The padding amount equals the dilation of the convolution that
+    follows it. ``groups`` is applied to the second convolution only. When ``in_dim`` is
+    given, the skip path goes through a 1x1 convolution mapping ``in_dim`` to ``dim``.
+    Args:
+        dim: number of output channels of both convolutions.
+        padding_type: ``'reflect'``, ``'replicate'`` or ``'zero'``.
+        norm_layer: callable building a normalisation layer from a channel count.
+        activation: module instance placed after the first normalisation.
+        use_dropout: insert ``Dropout(0.5)`` after the activation.
+        conv_kind: key passed to ``get_conv_block_ctor`` to pick the convolution class.
+        dilation: dilation of the first convolution.
+        in_dim: input channel count when it differs from ``dim``.
+        groups: ``groups`` argument of the second convolution.
+        second_dilation: dilation of the second convolution; defaults to ``dilation``.
+    Raises:
+        NotImplementedError: for an unsupported ``padding_type``.
+    """
+    def __init__(self, dim, padding_type, norm_layer, activation=nn.ReLU(True), use_dropout=False, conv_kind='default',
+                 dilation=1, in_dim=None, groups=1, second_dilation=None):
+        super().__init__()
+        self.in_dim = in_dim
+        self.dim = dim
+        self.conv_block = self.build_conv_block(
+            dim, padding_type, norm_layer, activation, use_dropout,
+            conv_kind=conv_kind, dilation=dilation, in_dim=in_dim, groups=groups,
+            second_dilation=dilation if second_dilation is None else second_dilation)
+        if in_dim is not None:
+            # 1x1 projection so the skip path carries `dim` channels
+            self.input_conv = nn.Conv2d(in_dim, dim, 1)
+        self.out_channnels = dim  # attribute name (sic) kept exactly as callers may read it
+    @staticmethod
+    def _padding_for(padding_type, amount):
+        """Return ``(explicit_pad_layers, conv_padding)`` for one convolution."""
+        if padding_type == 'reflect':
+            return [nn.ReflectionPad2d(amount)], 0
+        if padding_type == 'replicate':
+            return [nn.ReplicationPad2d(amount)], 0
+        if padding_type == 'zero':
+            return [], amount
+        raise NotImplementedError('padding [%s] is not implemented' % padding_type)
+    def build_conv_block(self, dim, padding_type, norm_layer, activation, use_dropout, conv_kind='default',
+                         dilation=1, in_dim=None, groups=1, second_dilation=1):
+        """Assemble the residual branch as an ``nn.Sequential`` (layer order is fixed)."""
+        conv_layer = get_conv_block_ctor(conv_kind)
+        first_pad, first_p = self._padding_for(padding_type, dilation)
+        first_in = dim if in_dim is None else in_dim
+        layers = list(first_pad)
+        layers.append(conv_layer(first_in, dim, kernel_size=3, padding=first_p, dilation=dilation))
+        layers.append(norm_layer(dim))
+        layers.append(activation)
+        if use_dropout:
+            layers.append(nn.Dropout(0.5))
+        second_pad, second_p = self._padding_for(padding_type, second_dilation)
+        layers.extend(second_pad)
+        layers.append(conv_layer(dim, dim, kernel_size=3, padding=second_p, dilation=second_dilation, groups=groups))
+        layers.append(norm_layer(dim))
+        return nn.Sequential(*layers)
+    def forward(self, x):
+        """Return ``skip(x) + conv_block(x)``; the branch always sees the unprojected input."""
+        skip = x if self.in_dim is None else self.input_conv(x)
+        return skip + self.conv_block(x)
+class ResnetBlock5x5(nn.Module):
+    """Residual block built from two 5x5 convolutions.
+    Same layout as a 3x3 residual block (padding, conv, norm, activation, optional
+    ``Dropout(0.5)``, padding, conv, norm), but each convolution has ``kernel_size=5``
+    and is padded by twice its dilation. ``groups`` is applied to the second convolution
+    only. When ``in_dim`` is given, the skip path goes through a 1x1 convolution.
+    Args:
+        dim: number of output channels of both convolutions.
+        padding_type: ``'reflect'``, ``'replicate'`` or ``'zero'``.
+        norm_layer: callable building a normalisation layer from a channel count.
+        activation: module instance placed after the first normalisation.
+        use_dropout: insert ``Dropout(0.5)`` after the activation.
+        conv_kind: key passed to ``get_conv_block_ctor`` to pick the convolution class.
+        dilation: dilation of the first convolution.
+        in_dim: input channel count when it differs from ``dim``.
+        groups: ``groups`` argument of the second convolution.
+        second_dilation: dilation of the second convolution; defaults to ``dilation``.
+    Raises:
+        NotImplementedError: for an unsupported ``padding_type``.
+    """
+    def __init__(self, dim, padding_type, norm_layer, activation=nn.ReLU(True), use_dropout=False, conv_kind='default',
+                 dilation=1, in_dim=None, groups=1, second_dilation=None):
+        super().__init__()
+        self.in_dim = in_dim
+        self.dim = dim
+        effective_second = dilation if second_dilation is None else second_dilation
+        self.conv_block = self.build_conv_block(dim, padding_type, norm_layer, activation, use_dropout,
+                                                conv_kind=conv_kind, dilation=dilation, in_dim=in_dim,
+                                                groups=groups, second_dilation=effective_second)
+        if in_dim is not None:
+            # 1x1 projection so the skip path carries `dim` channels
+            self.input_conv = nn.Conv2d(in_dim, dim, 1)
+        self.out_channnels = dim  # attribute name (sic) kept exactly as callers may read it
+    @staticmethod
+    def _padded_conv_args(padding_type, conv_dilation):
+        """Return ``(explicit_pad_layers, conv_padding)``; a 5x5 kernel needs 2 * dilation."""
+        amount = conv_dilation * 2
+        if padding_type == 'reflect':
+            return [nn.ReflectionPad2d(amount)], 0
+        if padding_type == 'replicate':
+            return [nn.ReplicationPad2d(amount)], 0
+        if padding_type == 'zero':
+            return [], amount
+        raise NotImplementedError('padding [%s] is not implemented' % padding_type)
+    def build_conv_block(self, dim, padding_type, norm_layer, activation, use_dropout, conv_kind='default',
+                         dilation=1, in_dim=None, groups=1, second_dilation=1):
+        """Assemble the residual branch as an ``nn.Sequential`` (layer order is fixed)."""
+        conv_layer = get_conv_block_ctor(conv_kind)
+        pads, conv_padding = self._padded_conv_args(padding_type, dilation)
+        source_channels = dim if in_dim is None else in_dim
+        branch = pads + [conv_layer(source_channels, dim, kernel_size=5, padding=conv_padding, dilation=dilation),
+                         norm_layer(dim),
+                         activation]
+        if use_dropout:
+            branch.append(nn.Dropout(0.5))
+        pads, conv_padding = self._padded_conv_args(padding_type, second_dilation)
+        branch += pads
+        branch += [conv_layer(dim, dim, kernel_size=5, padding=conv_padding, dilation=second_dilation, groups=groups),
+                   norm_layer(dim)]
+        return nn.Sequential(*branch)
+    def forward(self, x):
+        """Return ``skip(x) + conv_block(x)``; the branch always sees the unprojected input."""
+        skip = x if self.in_dim is None else self.input_conv(x)
+        return skip + self.conv_block(x)
+class MultidilatedResnetBlock(nn.Module):
+    """Residual block whose two 3x3 convolutions come from a caller-supplied constructor.
+    The residual branch is: conv, norm, activation, optional ``Dropout(0.5)``, conv, norm.
+    Padding is delegated to ``conv_layer`` through its ``padding_mode`` argument.
+    Args:
+        dim: channel count of the input and of both convolutions.
+        padding_type: forwarded to ``conv_layer`` as ``padding_mode``.
+        conv_layer: callable ``(in, out, kernel_size=..., padding_mode=...) -> nn.Module``.
+        norm_layer: callable building a normalisation layer from a channel count.
+        activation: module instance placed after the first normalisation.
+        use_dropout: insert ``Dropout(0.5)`` after the activation.
+    """
+    def __init__(self, dim, padding_type, conv_layer, norm_layer, activation=nn.ReLU(True), use_dropout=False):
+        super().__init__()
+        self.conv_block = self.build_conv_block(dim, padding_type, conv_layer, norm_layer, activation, use_dropout)
+    def build_conv_block(self, dim, padding_type, conv_layer, norm_layer, activation, use_dropout, dilation=1):
+        """Assemble the residual branch; ``dilation`` is accepted but not used here."""
+        def conv3x3():
+            return conv_layer(dim, dim, kernel_size=3, padding_mode=padding_type)
+        layers = [conv3x3(), norm_layer(dim), activation]
+        if use_dropout:
+            layers.append(nn.Dropout(0.5))
+        layers.extend([conv3x3(), norm_layer(dim)])
+        return nn.Sequential(*layers)
+    def forward(self, x):
+        """Return the input plus the residual branch output."""
+        return x + self.conv_block(x)
+class MultiDilatedGlobalGenerator(nn.Module):
+    """Encoder / bottleneck / decoder generator with multidilated residual blocks.
+    Layout of ``self.model``: ReflectionPad2d(3) + 7x7 conv + norm + activation, then
+    ``n_downsampling`` stride-2 3x3 convolutions that double the channel count (capped at
+    ``max_features``), then ``n_blocks`` ``MultidilatedResnetBlock``s, then
+    ``n_downsampling`` upsampling stages from ``deconv_factory``, then ReflectionPad2d(3)
+    + 7x7 ``nn.Conv2d`` to ``output_nc`` and an optional output activation.
+    Args:
+        input_nc: channel count of the input.
+        output_nc: channel count of the output.
+        ngf: channel count after the first convolution.
+        n_downsampling: number of downsampling (and upsampling) stages.
+        n_blocks: number of residual blocks in the bottleneck.
+        norm_layer, up_norm_layer: resolved through ``get_norm_layer``; when ``affine`` is
+            not None both are bound with ``affine=affine``.
+        padding_type: forwarded to the residual blocks.
+        conv_kind: key for ``get_conv_block_ctor`` used for the stem and downsampling convs.
+        deconv_kind: forwarded to ``deconv_factory``.
+        activation, up_activation: module instances reused at every stage.
+        add_out_act: ``True`` appends ``get_activation('tanh')``; any other truthy value
+            is passed to ``get_activation`` itself; falsy appends nothing.
+        max_features: upper bound on the channel count.
+        multidilation_kwargs: extra keyword arguments bound to the 'multidilated' conv ctor.
+        ffc_positions: block indices before which an ``FFCResnetBlock`` is inserted.
+        ffc_kwargs: extra keyword arguments for ``FFCResnetBlock``.
+    """
+    def __init__(self, input_nc, output_nc, ngf=64, n_downsampling=3,
+                 n_blocks=3, norm_layer=nn.BatchNorm2d,
+                 padding_type='reflect', conv_kind='default',
+                 deconv_kind='convtranspose', activation=nn.ReLU(True),
+                 up_norm_layer=nn.BatchNorm2d, affine=None, up_activation=nn.ReLU(True),
+                 add_out_act=True, max_features=1024, multidilation_kwargs={},
+                 ffc_positions=None, ffc_kwargs={}):
+        assert (n_blocks >= 0)
+        super().__init__()
+        conv_layer = get_conv_block_ctor(conv_kind)
+        # residual blocks always use the 'multidilated' conv, regardless of conv_kind
+        resnet_conv_layer = functools.partial(get_conv_block_ctor('multidilated'), **multidilation_kwargs)
+        norm_layer = get_norm_layer(norm_layer)
+        if affine is not None:
+            norm_layer = partial(norm_layer, affine=affine)
+        up_norm_layer = get_norm_layer(up_norm_layer)
+        if affine is not None:
+            up_norm_layer = partial(up_norm_layer, affine=affine)
+        # stem: explicit reflection padding, so the 7x7 conv itself uses padding=0
+        model = [nn.ReflectionPad2d(3),
+                 conv_layer(input_nc, ngf, kernel_size=7, padding=0),
+                 norm_layer(ngf),
+                 activation]
+        # NOTE(review): `identity` is never used below
+        identity = Identity()
+        ### downsample
+        for i in range(n_downsampling):
+            mult = 2 ** i
+            model += [conv_layer(min(max_features, ngf * mult),
+                                    min(max_features, ngf * mult * 2),
+                                    kernel_size=3, stride=2, padding=1),
+                        norm_layer(min(max_features, ngf * mult * 2)),
+                        activation]
+        mult = 2 ** n_downsampling
+        feats_num_bottleneck = min(max_features, ngf * mult)
+        ### resnet blocks
+        for i in range(n_blocks):
+            # an FFC block, when requested for index i, goes in front of residual block i
+            if ffc_positions is not None and i in ffc_positions:
+                model += [FFCResnetBlock(feats_num_bottleneck, padding_type, norm_layer, activation_layer=nn.ReLU,
+                                         inline=True, **ffc_kwargs)]
+            model += [MultidilatedResnetBlock(feats_num_bottleneck, padding_type=padding_type,
+                                              conv_layer=resnet_conv_layer, activation=activation,
+                                              norm_layer=norm_layer)]
+        ### upsample
+        for i in range(n_downsampling):
+            mult = 2 ** (n_downsampling - i)
+            model += deconv_factory(deconv_kind, ngf, mult, up_norm_layer, up_activation, max_features)
+        model += [nn.ReflectionPad2d(3),
+                  nn.Conv2d(ngf, output_nc, kernel_size=7, padding=0)]
+        if add_out_act:
+            model.append(get_activation('tanh' if add_out_act is True else add_out_act))
+        self.model = nn.Sequential(*model)
+    def forward(self, input):
+        # run the whole sequential stack on the input
+        return self.model(input)
+class ConfigGlobalGenerator(nn.Module):
+    """Encoder / bottleneck / decoder generator whose bottleneck is described by block specs.
+    Layout of ``self.model``: ReflectionPad2d(3) + 7x7 conv + norm + activation, then
+    ``n_downsampling`` stride-2 3x3 convolutions that double the channel count (capped at
+    ``max_features``), then the residual blocks described by ``manual_block_spec``, then
+    ``n_downsampling`` upsampling stages from ``deconv_factory``, then ReflectionPad2d(3)
+    + 7x7 ``nn.Conv2d`` to ``output_nc`` and an optional output activation.
+    Args:
+        input_nc: channel count of the input.
+        output_nc: channel count of the output.
+        ngf: channel count after the first convolution.
+        n_downsampling: number of downsampling (and upsampling) stages.
+        n_blocks: number of residual blocks used when ``manual_block_spec`` is empty.
+        norm_layer, up_norm_layer: resolved through ``get_norm_layer``; when ``affine`` is
+            not None both are bound with ``affine=affine``.
+        padding_type: forwarded to the residual blocks.
+        conv_kind: key for ``get_conv_block_ctor`` used for the stem and downsampling convs.
+        deconv_kind: forwarded to ``deconv_factory``.
+        activation, up_activation: module instances reused at every stage.
+        add_out_act: ``True`` appends ``get_activation('tanh')``; any other truthy value
+            is passed to ``get_activation`` itself; falsy appends nothing.
+        max_features: upper bound on the channel count.
+        manual_block_spec: sequence of mappings, each describing one run of residual
+            blocks. Keys read: ``n_blocks``, ``use_default``, ``resnet_conv_kind``,
+            ``resnet_block_kind``, ``resnet_dilation``, ``multidilation_kwargs``. A spec
+            with a truthy ``use_default`` uses the constructor-level settings below.
+        resnet_block_kind: ``'multidilatedresnetblock'``, ``'resnetblock'``,
+            ``'resnetblock5x5'`` or ``'resnetblockdwdil'``.
+        resnet_conv_kind: key for ``get_conv_block_ctor`` used inside the residual blocks.
+        resnet_dilation: dilation used by ``'resnetblockdwdil'`` blocks.
+        multidilation_kwargs: extra keyword arguments bound to the residual conv ctor.
+    Raises:
+        ValueError: if a block has to be built for an unknown ``resnet_block_kind``.
+    """
+    def __init__(self, input_nc, output_nc, ngf=64, n_downsampling=3,
+                 n_blocks=3, norm_layer=nn.BatchNorm2d,
+                 padding_type='reflect', conv_kind='default',
+                 deconv_kind='convtranspose', activation=nn.ReLU(True),
+                 up_norm_layer=nn.BatchNorm2d, affine=None, up_activation=nn.ReLU(True),
+                 add_out_act=True, max_features=1024,
+                 manual_block_spec=[],
+                 resnet_block_kind='multidilatedresnetblock',
+                 resnet_conv_kind='multidilated',
+                 resnet_dilation=1,
+                 multidilation_kwargs={}):
+        # The list/dict defaults are only read, never mutated, so sharing them is harmless.
+        assert (n_blocks >= 0)
+        super().__init__()
+        conv_layer = get_conv_block_ctor(conv_kind)
+        resnet_conv_layer = functools.partial(get_conv_block_ctor(resnet_conv_kind), **multidilation_kwargs)
+        norm_layer = get_norm_layer(norm_layer)
+        if affine is not None:
+            norm_layer = partial(norm_layer, affine=affine)
+        up_norm_layer = get_norm_layer(up_norm_layer)
+        if affine is not None:
+            up_norm_layer = partial(up_norm_layer, affine=affine)
+        # stem: explicit reflection padding, so the 7x7 conv itself uses padding=0
+        model = [nn.ReflectionPad2d(3),
+                 conv_layer(input_nc, ngf, kernel_size=7, padding=0),
+                 norm_layer(ngf),
+                 activation]
+        ### downsample
+        for i in range(n_downsampling):
+            mult = 2 ** i
+            model += [conv_layer(min(max_features, ngf * mult),
+                                 min(max_features, ngf * mult * 2),
+                                 kernel_size=3, stride=2, padding=1),
+                      norm_layer(min(max_features, ngf * mult * 2)),
+                      activation]
+        mult = 2 ** n_downsampling
+        feats_num_bottleneck = min(max_features, ngf * mult)
+        if len(manual_block_spec) == 0:
+            manual_block_spec = [{'n_blocks': n_blocks, 'use_default': True}]
+        ### resnet blocks
+        for raw_spec in manual_block_spec:
+            block_spec = DotDict(lambda: None, raw_spec)
+            # Start every run from the constructor-level settings. The previous nested
+            # helper rebound these names only inside an `if`, which made them function
+            # locals and raised UnboundLocalError whenever `use_default` was set.
+            cur_conv_layer = resnet_conv_layer
+            cur_conv_kind = resnet_conv_kind
+            cur_block_kind = resnet_block_kind
+            cur_dilation = resnet_dilation
+            if not block_spec.use_default:
+                # a missing 'multidilation_kwargs' key reads as None; treat it as "no extras"
+                cur_conv_layer = functools.partial(get_conv_block_ctor(block_spec.resnet_conv_kind),
+                                                   **(block_spec.multidilation_kwargs or {}))
+                cur_conv_kind = block_spec.resnet_conv_kind
+                cur_block_kind = block_spec.resnet_block_kind
+                if block_spec.resnet_dilation is not None:
+                    cur_dilation = block_spec.resnet_dilation
+            for _ in range(block_spec.n_blocks):
+                # All kinds use the capped bottleneck width so they match the encoder output
+                # even when ngf * 2**n_downsampling exceeds max_features.
+                if cur_block_kind == "multidilatedresnetblock":
+                    block = MultidilatedResnetBlock(feats_num_bottleneck, padding_type=padding_type,
+                                                    conv_layer=cur_conv_layer, activation=activation,
+                                                    norm_layer=norm_layer)
+                elif cur_block_kind == "resnetblock":
+                    block = ResnetBlock(feats_num_bottleneck, padding_type=padding_type, activation=activation,
+                                        norm_layer=norm_layer, conv_kind=cur_conv_kind)
+                elif cur_block_kind == "resnetblock5x5":
+                    block = ResnetBlock5x5(feats_num_bottleneck, padding_type=padding_type, activation=activation,
+                                           norm_layer=norm_layer, conv_kind=cur_conv_kind)
+                elif cur_block_kind == "resnetblockdwdil":
+                    block = ResnetBlock(feats_num_bottleneck, padding_type=padding_type, activation=activation,
+                                        norm_layer=norm_layer, conv_kind=cur_conv_kind,
+                                        dilation=cur_dilation, second_dilation=cur_dilation)
+                else:
+                    # previously an unknown kind silently contributed no blocks at all
+                    raise ValueError(f'Unknown resnet_block_kind "{cur_block_kind}"')
+                model.append(block)
+        ### upsample
+        for i in range(n_downsampling):
+            mult = 2 ** (n_downsampling - i)
+            model += deconv_factory(deconv_kind, ngf, mult, up_norm_layer, up_activation, max_features)
+        model += [nn.ReflectionPad2d(3),
+                  nn.Conv2d(ngf, output_nc, kernel_size=7, padding=0)]
+        if add_out_act:
+            model.append(get_activation('tanh' if add_out_act is True else add_out_act))
+        self.model = nn.Sequential(*model)
+    def forward(self, input):
+        """Run the whole sequential stack on ``input``."""
+        return self.model(input)
+def make_dil_blocks(dilated_blocks_n, dilation_block_kind, dilated_block_kwargs):
+    """Build ``dilated_blocks_n`` dilated residual blocks of the requested kind.
+    ``'simple'`` yields ``ResnetBlock``s whose dilation doubles per block (2, 4, 8, ...);
+    ``'multi'`` yields ``MultidilatedResnetBlock``s. ``dilated_block_kwargs`` is forwarded
+    to each constructor.
+    Raises:
+        ValueError: for any other ``dilation_block_kind`` (only when a block is built).
+    """
+    def build(index):
+        if dilation_block_kind == 'simple':
+            return ResnetBlock(**dilated_block_kwargs, dilation=2 ** (index + 1))
+        if dilation_block_kind == 'multi':
+            return MultidilatedResnetBlock(**dilated_block_kwargs)
+        raise ValueError(f'dilation_block_kind could not be "{dilation_block_kind}"')
+    return [build(index) for index in range(dilated_blocks_n)]
+class GlobalGenerator(nn.Module):
+    """Encoder / bottleneck / decoder generator built from ``ResnetBlock``s.
+    Layout of ``self.model``: ReflectionPad2d(3) + 7x7 conv + norm + activation, then
+    ``n_downsampling`` stride-2 3x3 convolutions that double the channel count (capped at
+    ``max_features``), then the bottleneck, then ``n_downsampling`` stride-2
+    ``nn.ConvTranspose2d`` stages that halve the channel count, then ReflectionPad2d(3)
+    + 7x7 ``nn.Conv2d`` to ``output_nc`` and an optional output activation.
+    The bottleneck consists of ``n_blocks`` ``ResnetBlock``s, with optional extras:
+    dilated blocks before the first block (``dilated_blocks_n_start``), before block
+    ``n_blocks // 2`` (``dilated_blocks_n_middle``) and after the last block
+    (``dilated_blocks_n``), plus ``FFCResnetBlock``s before the block indices listed in
+    ``ffc_positions``.
+    Args:
+        input_nc: channel count of the input.
+        output_nc: channel count of the output.
+        ngf: channel count after the first convolution.
+        norm_layer, up_norm_layer: resolved through ``get_norm_layer``; when ``affine`` is
+            not None both are bound with ``affine=affine``.
+        padding_type: forwarded to the residual blocks.
+        conv_kind: key for ``get_conv_block_ctor``; also forwarded to the ``ResnetBlock``s.
+        activation, up_activation: module instances reused at every stage.
+        add_out_act: ``True`` appends ``get_activation('tanh')``; any other truthy value
+            is passed to ``get_activation`` itself; falsy appends nothing.
+        max_features: upper bound on the channel count.
+        is_resblock_depthwise: use ``groups`` equal to the bottleneck channel count in
+            the ``ResnetBlock``s instead of 1.
+        ffc_positions: iterable of block indices; an index listed k times gets k FFC blocks.
+        ffc_kwargs: extra keyword arguments for ``FFCResnetBlock``.
+        dilation, second_dilation: forwarded to the regular ``ResnetBlock``s.
+        dilation_block_kind: ``'simple'`` or ``'multi'``, forwarded to ``make_dil_blocks``.
+        multidilation_kwargs: bound to the 'multidilated' conv ctor for ``'multi'`` blocks.
+    """
+    def __init__(self, input_nc, output_nc, ngf=64, n_downsampling=3, n_blocks=9, norm_layer=nn.BatchNorm2d,
+                 padding_type='reflect', conv_kind='default', activation=nn.ReLU(True),
+                 up_norm_layer=nn.BatchNorm2d, affine=None,
+                 up_activation=nn.ReLU(True), dilated_blocks_n=0, dilated_blocks_n_start=0,
+                 dilated_blocks_n_middle=0,
+                 add_out_act=True,
+                 max_features=1024, is_resblock_depthwise=False,
+                 ffc_positions=None, ffc_kwargs={}, dilation=1, second_dilation=None,
+                 dilation_block_kind='simple', multidilation_kwargs={}):
+        assert (n_blocks >= 0)
+        super().__init__()
+        conv_layer = get_conv_block_ctor(conv_kind)
+        norm_layer = get_norm_layer(norm_layer)
+        if affine is not None:
+            norm_layer = partial(norm_layer, affine=affine)
+        up_norm_layer = get_norm_layer(up_norm_layer)
+        if affine is not None:
+            up_norm_layer = partial(up_norm_layer, affine=affine)
+        if ffc_positions is not None:
+            # Counter keeps multiplicities: a position listed twice gets two FFC blocks
+            ffc_positions = collections.Counter(ffc_positions)
+        # stem: explicit reflection padding, so the 7x7 conv itself uses padding=0
+        model = [nn.ReflectionPad2d(3),
+                 conv_layer(input_nc, ngf, kernel_size=7, padding=0),
+                 norm_layer(ngf),
+                 activation]
+        # NOTE(review): `identity` is never used below
+        identity = Identity()
+        ### downsample
+        for i in range(n_downsampling):
+            mult = 2 ** i
+            model += [conv_layer(min(max_features, ngf * mult),
+                                min(max_features, ngf * mult * 2),
+                                kernel_size=3, stride=2, padding=1),
+                        norm_layer(min(max_features, ngf * mult * 2)),
+                        activation]
+        mult = 2 ** n_downsampling
+        feats_num_bottleneck = min(max_features, ngf * mult)
+        # keyword arguments shared by every dilated block; the kind decides which
+        # convolution-related argument is added
+        dilated_block_kwargs = dict(dim=feats_num_bottleneck, padding_type=padding_type,
+                                    activation=activation, norm_layer=norm_layer)
+        if dilation_block_kind == 'simple':
+            dilated_block_kwargs['conv_kind'] = conv_kind
+        elif dilation_block_kind == 'multi':
+            dilated_block_kwargs['conv_layer'] = functools.partial(
+                get_conv_block_ctor('multidilated'), **multidilation_kwargs)
+        # dilated blocks at the start of the bottleneck sausage
+        if dilated_blocks_n_start is not None and dilated_blocks_n_start > 0:
+            model += make_dil_blocks(dilated_blocks_n_start, dilation_block_kind, dilated_block_kwargs)
+        # resnet blocks
+        for i in range(n_blocks):
+            # dilated blocks at the middle of the bottleneck sausage
+            if i == n_blocks // 2 and dilated_blocks_n_middle is not None and dilated_blocks_n_middle > 0:
+                model += make_dil_blocks(dilated_blocks_n_middle, dilation_block_kind, dilated_block_kwargs)
+            if ffc_positions is not None and i in ffc_positions:
+                for _ in range(ffc_positions[i]):  # same position can occur more than once
+                    model += [FFCResnetBlock(feats_num_bottleneck, padding_type, norm_layer, activation_layer=nn.ReLU,
+                                             inline=True, **ffc_kwargs)]
+            # depthwise variant: one group per bottleneck channel
+            if is_resblock_depthwise:
+                resblock_groups = feats_num_bottleneck
+            else:
+                resblock_groups = 1
+            model += [ResnetBlock(feats_num_bottleneck, padding_type=padding_type, activation=activation,
+                                    norm_layer=norm_layer, conv_kind=conv_kind, groups=resblock_groups,
+                                    dilation=dilation, second_dilation=second_dilation)]
+        # dilated blocks at the end of the bottleneck sausage
+        if dilated_blocks_n is not None and dilated_blocks_n > 0:
+            model += make_dil_blocks(dilated_blocks_n, dilation_block_kind, dilated_block_kwargs)
+        # upsample
+        for i in range(n_downsampling):
+            mult = 2 ** (n_downsampling - i)
+            model += [nn.ConvTranspose2d(min(max_features, ngf * mult),
+                                         min(max_features, int(ngf * mult / 2)),
+                                         kernel_size=3, stride=2, padding=1, output_padding=1),
+                      up_norm_layer(min(max_features, int(ngf * mult / 2))),
+                      up_activation]
+        model += [nn.ReflectionPad2d(3),
+                  nn.Conv2d(ngf, output_nc, kernel_size=7, padding=0)]
+        if add_out_act:
+            model.append(get_activation('tanh' if add_out_act is True else add_out_act))
+        self.model = nn.Sequential(*model)
+    def forward(self, input):
+        # run the whole sequential stack on the input
+        return self.model(input)
+class GlobalGeneratorGated(GlobalGenerator):
+    """``GlobalGenerator`` preset using ``'gated_bn_relu'`` convolutions.
+    Activation and normalisation default to identity; any keyword argument passed by
+    the caller takes precedence over these presets.
+    """
+    def __init__(self, *args, **kwargs):
+        presets = {
+            'conv_kind': 'gated_bn_relu',
+            'activation': nn.Identity(),
+            'norm_layer': nn.Identity,
+        }
+        super().__init__(*args, **{**presets, **kwargs})
+class GlobalGeneratorFromSuperChannels(nn.Module):
+    """Encoder / bottleneck / decoder generator whose layer widths come from ``super_channels``.
+    ``convert_super_channels`` turns ``super_channels`` into a flat list ``self.channels``:
+    first the stem and downsampling widths, then three bottleneck widths, then the
+    upsampling widths. The ``n_blocks`` residual blocks are split into three groups
+    (``n_blocks // 3``, ``n_blocks // 3`` and the remainder), one per bottleneck width;
+    the first block of the second and third group receives ``in_dim`` so it can change
+    the channel count.
+    Args:
+        input_nc: channel count of the input.
+        output_nc: channel count of the output.
+        n_downsampling: number of downsampling (and upsampling) stages; only 2 and 3
+            are supported by ``convert_super_channels``.
+        n_blocks: total number of residual blocks.
+        super_channels: sequence of base widths indexed by ``convert_super_channels``.
+        norm_layer: resolved through ``get_norm_layer``.
+        padding_type: forwarded to the residual blocks.
+        add_out_act: ``True`` appends ``get_activation('tanh')``; any other truthy value
+            is passed to ``get_activation`` itself; falsy appends nothing.
+    """
+    def __init__(self, input_nc, output_nc, n_downsampling, n_blocks, super_channels, norm_layer="bn", padding_type='reflect', add_out_act=True):
+        super().__init__()
+        self.n_downsampling = n_downsampling
+        norm_layer = get_norm_layer(norm_layer)
+        # convolutions get a bias only when the normalisation is InstanceNorm2d
+        if type(norm_layer) == functools.partial:
+            use_bias = (norm_layer.func == nn.InstanceNorm2d)
+        else:
+            use_bias = (norm_layer == nn.InstanceNorm2d)
+        channels = self.convert_super_channels(super_channels)
+        self.channels = channels
+        model = [nn.ReflectionPad2d(3),
+                 nn.Conv2d(input_nc, channels[0], kernel_size=7, padding=0, bias=use_bias),
+                 norm_layer(channels[0]),
+                 nn.ReLU(True)]
+        for i in range(n_downsampling):  # add downsampling layers
+            mult = 2 ** i  # not used below
+            model += [nn.Conv2d(channels[0+i], channels[1+i], kernel_size=3, stride=2, padding=1, bias=use_bias),
+                      norm_layer(channels[1+i]),
+                      nn.ReLU(True)]
+        mult = 2 ** n_downsampling  # not used below
+        # split the residual blocks into three groups; the last one takes the remainder
+        n_blocks1 = n_blocks // 3
+        n_blocks2 = n_blocks1
+        n_blocks3 = n_blocks - n_blocks1 - n_blocks2
+        for i in range(n_blocks1):
+            c = n_downsampling
+            dim = channels[c]
+            model += [ResnetBlock(dim, padding_type=padding_type, norm_layer=norm_layer)]
+        for i in range(n_blocks2):
+            c = n_downsampling+1
+            dim = channels[c]
+            kwargs = {}
+            if i == 0:
+                # first block of the group adapts from the previous group's width
+                kwargs = {"in_dim": channels[c-1]}
+            model += [ResnetBlock(dim, padding_type=padding_type, norm_layer=norm_layer, **kwargs)]
+        for i in range(n_blocks3):
+            c = n_downsampling+2
+            dim = channels[c]
+            kwargs = {}
+            if i == 0:
+                # first block of the group adapts from the previous group's width
+                kwargs = {"in_dim": channels[c-1]}
+            model += [ResnetBlock(dim, padding_type=padding_type, norm_layer=norm_layer, **kwargs)]
+        for i in range(n_downsampling):  # add upsampling layers
+            mult = 2 ** (n_downsampling - i)  # not used below
+            model += [nn.ConvTranspose2d(channels[n_downsampling+3+i],
+                                           channels[n_downsampling+3+i+1],
+                                           kernel_size=3, stride=2,
+                                           padding=1, output_padding=1,
+                                           bias=use_bias),
+                      norm_layer(channels[n_downsampling+3+i+1]),
+                      nn.ReLU(True)]
+        model += [nn.ReflectionPad2d(3)]
+        model += [nn.Conv2d(channels[2*n_downsampling+3], output_nc, kernel_size=7, padding=0)]
+        if add_out_act:
+            model.append(get_activation('tanh' if add_out_act is True else add_out_act))
+        self.model = nn.Sequential(*model)
+    def convert_super_channels(self, super_channels):
+        """Expand ``super_channels`` into the flat per-stage channel list.
+        Reads ``self.n_downsampling`` and raises ``NotImplementedError`` unless it is
+        2 or 3. Each appended width is also reported through ``logging.info``.
+        """
+        # NOTE(review): the index constants below (N1, [1,4,7,10], [22,25,28]) presumably
+        # mirror layer positions of the network the widths were searched on -- confirm.
+        n_downsampling = self.n_downsampling
+        result = []
+        cnt = 0
+        if n_downsampling == 2:
+            N1 = 10
+        elif n_downsampling == 3:
+            N1 = 13
+        else:
+            raise NotImplementedError
+        # stem + downsampling widths: super_channels[cnt] scaled by 2 ** cnt
+        for i in range(0, N1):
+            if i in [1,4,7,10]:
+                channel = super_channels[cnt] * (2 ** cnt)
+                config = {'channel': channel}  # not used
+                result.append(channel)
+                logging.info(f"Downsample channels {result[-1]}")
+                cnt += 1
+        # three bottleneck widths; a 6-entry super_channels shares entry 3 for all of them
+        for i in range(3):
+            for counter, j in enumerate(range(N1 + i * 3, N1 + 3 + i * 3)):
+                if len(super_channels) == 6:
+                    channel = super_channels[3] * 4
+                else:
+                    channel = super_channels[i + 3] * 4
+                config = {'channel': channel}  # not used
+                if counter == 0:
+                    # only the first of each group of three is recorded
+                    result.append(channel)
+                    logging.info(f"Bottleneck channels {result[-1]}")
+        # upsampling widths: cnt runs 1, 0, -1, so the last scale factor is 0.5 (hence int())
+        cnt = 2
+        for i in range(N1+9, N1+21):
+            if i in [22, 25,28]:
+                cnt -= 1
+                if len(super_channels) == 6:
+                    channel = super_channels[5 - cnt] * (2 ** cnt)
+                else:
+                    channel = super_channels[7 - cnt] * (2 ** cnt)
+                result.append(int(channel))
+                logging.info(f"Upsample channels {result[-1]}")
+        return result
+    def forward(self, input):
+        # run the whole sequential stack on the input
+        return self.model(input)
+# Defines the PatchGAN discriminator with the specified arguments.
+class NLayerDiscriminator(BaseDiscriminator):
+    """PatchGAN discriminator made of 4x4 convolution stages.
+    Stages are registered as ``model0``, ``model1``, ...: a stride-2 conv + LeakyReLU,
+    then ``n_layers - 1`` stride-2 conv + norm + LeakyReLU stages, one stride-1 conv +
+    norm + LeakyReLU stage, and a final stride-1 conv to a single channel. The channel
+    count doubles per stage, capped at 512.
+    Args:
+        input_nc: channel count of the input.
+        ndf: channel count after the first convolution.
+        n_layers: number of stride-2 stages.
+        norm_layer: callable building a normalisation layer from a channel count.
+    """
+    def __init__(self, input_nc, ndf=64, n_layers=3, norm_layer=nn.BatchNorm2d,):
+        super().__init__()
+        self.n_layers = n_layers
+        kw = 4
+        padw = int(np.ceil((kw-1.0)/2))
+        stages = [[nn.Conv2d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw),
+                   nn.LeakyReLU(0.2, True)]]
+        nf = ndf
+        # the strided stages are followed by exactly one stride-1 stage of the same shape
+        for stride in [2] * (n_layers - 1) + [1]:
+            nf_prev, nf = nf, min(nf * 2, 512)
+            stages.append([nn.Conv2d(nf_prev, nf, kernel_size=kw, stride=stride, padding=padw),
+                           norm_layer(nf),
+                           nn.LeakyReLU(0.2, True)])
+        stages.append([nn.Conv2d(nf, 1, kernel_size=kw, stride=1, padding=padw)])
+        for index, layers in enumerate(stages):
+            setattr(self, 'model' + str(index), nn.Sequential(*layers))
+    def get_all_activations(self, x):
+        """Return the output of every stage ``model0 .. model{n_layers + 1}`` in order."""
+        activations = []
+        current = x
+        for index in range(self.n_layers + 2):
+            current = getattr(self, 'model' + str(index))(current)
+            activations.append(current)
+        return activations
+    def forward(self, x):
+        """Return ``(last_activation, list_of_earlier_activations)``."""
+        activations = self.get_all_activations(x)
+        return activations[-1], activations[:-1]
+class MultidilatedNLayerDiscriminator(BaseDiscriminator):
+    """N-layer discriminator whose intermediate stride-2 stages use ``MultidilatedConv``.
+    Stages are registered as ``model0``, ``model1``, ...: a stride-2 ``nn.Conv2d`` +
+    LeakyReLU, then ``n_layers - 1`` stride-2 ``MultidilatedConv`` + norm + LeakyReLU
+    stages, one stride-1 ``nn.Conv2d`` + norm + LeakyReLU stage, and a final stride-1
+    ``nn.Conv2d`` to a single channel. The channel count doubles per stage, capped at 512.
+    Args:
+        input_nc: channel count of the input.
+        ndf: channel count after the first convolution.
+        n_layers: number of stride-2 stages.
+        norm_layer: callable building a normalisation layer from a channel count.
+        multidilation_kwargs: extra keyword arguments for every ``MultidilatedConv``.
+    """
+    def __init__(self, input_nc, ndf=64, n_layers=3, norm_layer=nn.BatchNorm2d, multidilation_kwargs={}):
+        super().__init__()
+        self.n_layers = n_layers
+        kw = 4
+        padw = int(np.ceil((kw-1.0)/2))
+        def next_width(channels):
+            return min(channels * 2, 512)
+        def norm_and_act(channels):
+            return [norm_layer(channels), nn.LeakyReLU(0.2, True)]
+        sequence = [[nn.Conv2d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw),
+                     nn.LeakyReLU(0.2, True)]]
+        nf = ndf
+        for _ in range(1, n_layers):
+            nf_prev, nf = nf, next_width(nf)
+            conv = MultidilatedConv(nf_prev, nf, kernel_size=kw, stride=2, padding=[2, 3], **multidilation_kwargs)
+            sequence.append([conv] + norm_and_act(nf))
+        nf_prev, nf = nf, next_width(nf)
+        conv = nn.Conv2d(nf_prev, nf, kernel_size=kw, stride=1, padding=padw)
+        sequence.append([conv] + norm_and_act(nf))
+        sequence.append([nn.Conv2d(nf, 1, kernel_size=kw, stride=1, padding=padw)])
+        for index, layers in enumerate(sequence):
+            setattr(self, 'model' + str(index), nn.Sequential(*layers))
+    def get_all_activations(self, x):
+        """Return the output of every stage ``model0 .. model{n_layers + 1}`` in order."""
+        outputs = []
+        current = x
+        for index in range(self.n_layers + 2):
+            current = getattr(self, 'model' + str(index))(current)
+            outputs.append(current)
+        return outputs
+    def forward(self, x):
+        """Return ``(last_activation, list_of_earlier_activations)``."""
+        outputs = self.get_all_activations(x)
+        return outputs[-1], outputs[:-1]
+class NLayerDiscriminatorAsGen(NLayerDiscriminator):
+    """``NLayerDiscriminator`` variant whose ``forward`` returns only the final activation."""
+    def forward(self, x):
+        prediction_and_features = super().forward(x)
+        return prediction_and_features[0]

annotator/lama/saicinpainting/training/modules/spatial_transform.py ADDED Viewed

	@@ -0,0 +1,49 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from kornia.geometry.transform import rotate
+class LearnableSpatialTransformWrapper(nn.Module):
+    """Run a wrapped module on a reflect-padded, rotated copy of its input.
+    The input is padded by ``pad_coef`` times its height/width on each side, rotated by
+    ``self.angle``, passed through ``impl``, rotated back by ``-self.angle`` and cropped
+    to drop the padding again.
+    Args:
+        impl: the wrapped module.
+        pad_coef: padding per side as a fraction of the input height/width.
+        angle_init_range: the initial angle is drawn uniformly from ``[0, angle_init_range)``.
+        train_angle: store the angle as an ``nn.Parameter``; otherwise it stays a plain
+            tensor attribute.
+    """
+    def __init__(self, impl, pad_coef=0.5, angle_init_range=80, train_angle=True):
+        super().__init__()
+        self.impl = impl
+        initial_angle = torch.rand(1) * angle_init_range
+        self.angle = nn.Parameter(initial_angle, requires_grad=True) if train_angle else initial_angle
+        self.pad_coef = pad_coef
+    def _pad_sizes(self, reference):
+        """Return ``(pad_h, pad_w)`` derived from the last two dims of ``reference``."""
+        height, width = reference.shape[2:]
+        return int(height * self.pad_coef), int(width * self.pad_coef)
+    def forward(self, x):
+        """Apply ``impl`` in the rotated frame; accepts a tensor or a tuple of tensors.
+        Raises:
+            ValueError: if ``x`` is neither a tensor nor a tuple.
+        """
+        if torch.is_tensor(x):
+            return self.inverse_transform(self.impl(self.transform(x)), x)
+        if not isinstance(x, tuple):
+            raise ValueError(f'Unexpected input type {type(x)}')
+        transformed = tuple(self.transform(item) for item in x)
+        outputs = self.impl(transformed)
+        return tuple(self.inverse_transform(out, source) for out, source in zip(outputs, x))
+    def transform(self, x):
+        """Reflect-pad ``x`` and rotate it by ``self.angle``."""
+        pad_h, pad_w = self._pad_sizes(x)
+        padded = F.pad(x, [pad_w, pad_w, pad_h, pad_h], mode='reflect')
+        return rotate(padded, angle=self.angle.to(padded))
+    def inverse_transform(self, y_padded_rotated, orig_x):
+        """Rotate back by ``-self.angle`` and crop the padding computed from ``orig_x``."""
+        pad_h, pad_w = self._pad_sizes(orig_x)
+        unrotated = rotate(y_padded_rotated, angle=-self.angle.to(y_padded_rotated))
+        full_h, full_w = unrotated.shape[2:]
+        return unrotated[:, :, pad_h : full_h - pad_h, pad_w : full_w - pad_w]
+if __name__ == '__main__':
+    # Smoke test: wrapping an identity module should keep the shape and reproduce the
+    # input everywhere except the outermost one-pixel border.
+    wrapper = LearnableSpatialTransformWrapper(nn.Identity())
+    source = torch.arange(2* 3 * 15 * 15).view(2, 3, 15, 15).float()
+    restored = wrapper(source)
+    assert source.shape == restored.shape
+    assert torch.allclose(source[:, :, 1:-1, 1:-1], restored[:, :, 1:-1, 1:-1])
+    print('all ok')

annotator/lama/saicinpainting/training/modules/squeeze_excitation.py ADDED Viewed

	@@ -0,0 +1,20 @@

+import torch.nn as nn
+class SELayer(nn.Module):
+    """Channel gate: rescale each channel by a weight computed from its global average.
+    The input is average-pooled to one value per channel, passed through
+    Linear -> ReLU -> Linear -> Sigmoid (both linear layers without bias, hidden width
+    ``channel // reduction``) and the resulting per-channel weights multiply the input.
+    Args:
+        channel: channel count of the input.
+        reduction: divisor giving the hidden width of the gate.
+    """
+    def __init__(self, channel, reduction=16):
+        super().__init__()
+        hidden = channel // reduction
+        self.avg_pool = nn.AdaptiveAvgPool2d(1)
+        gate_layers = [
+            nn.Linear(channel, hidden, bias=False),
+            nn.ReLU(inplace=True),
+            nn.Linear(hidden, channel, bias=False),
+            nn.Sigmoid(),
+        ]
+        self.fc = nn.Sequential(*gate_layers)
+    def forward(self, x):
+        """Return ``x`` scaled per channel; ``x`` must be 4-D."""
+        batch, channels, _, _ = x.size()
+        pooled = self.avg_pool(x).view(batch, channels)
+        gate = self.fc(pooled).view(batch, channels, 1, 1)
+        return x * gate.expand_as(x)

annotator/lama/saicinpainting/training/trainers/__init__.py ADDED Viewed

	@@ -0,0 +1,29 @@

+import logging
+import torch
+from annotator.lama.saicinpainting.training.trainers.default import DefaultInpaintingTrainingModule
+def get_training_model_class(kind):
+    """Map a trainer kind name to its class; only ``'default'`` is known.
+    Raises:
+        ValueError: for any other ``kind``.
+    """
+    if kind != 'default':
+        raise ValueError(f'Unknown trainer module {kind}')
+    return DefaultInpaintingTrainingModule
+def make_training_model(config):
+    """Instantiate the training module described by ``config.training_model``.
+    Every entry of ``config.training_model`` except ``kind`` is passed as a keyword
+    argument, together with ``use_ddp``, which is true when
+    ``config.trainer.kwargs['accelerator']`` equals ``'ddp'``.
+    """
+    kind = config.training_model.kind
+    kwargs = dict(config.training_model)
+    del kwargs['kind']
+    accelerator = config.trainer.kwargs.get('accelerator', None)
+    kwargs['use_ddp'] = accelerator == 'ddp'
+    logging.info(f'Make training model {kind}')
+    model_class = get_training_model_class(kind)
+    return model_class(config, **kwargs)
+def load_checkpoint(train_config, path, map_location='cuda', strict=True):
+    """Build the training model, load a state dict into its generator and return the generator.
+    Args:
+        train_config: configuration passed to ``make_training_model``.
+        path: checkpoint location handed to ``torch.load``.
+        map_location: forwarded to ``torch.load``.
+        strict: forwarded to ``load_state_dict``.
+    """
+    generator = make_training_model(train_config).generator
+    # NOTE(review): torch.load unpickles the file; only load checkpoints from trusted sources.
+    generator.load_state_dict(torch.load(path, map_location=map_location), strict=strict)
+    return generator

annotator/lama/saicinpainting/training/trainers/base.py ADDED Viewed

	@@ -0,0 +1,293 @@

+import copy
+import logging
+from typing import Dict, Tuple
+import pandas as pd
+import pytorch_lightning as ptl
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+# from torch.utils.data import DistributedSampler
+# from annotator.lama.saicinpainting.evaluation import make_evaluator
+# from annotator.lama.saicinpainting.training.data.datasets import make_default_train_dataloader, make_default_val_dataloader
+# from annotator.lama.saicinpainting.training.losses.adversarial import make_discrim_loss
+# from annotator.lama.saicinpainting.training.losses.perceptual import PerceptualLoss, ResNetPL
+from annotator.lama.saicinpainting.training.modules import make_generator  #, make_discriminator
+# from annotator.lama.saicinpainting.training.visualizers import make_visualizer
+from annotator.lama.saicinpainting.utils import add_prefix_to_keys, average_dicts, set_requires_grad, flatten_dict, \
+    get_has_ddp_rank
+# Module-level logger named after this module.
+LOGGER = logging.getLogger(__name__)
+def make_optimizer(parameters, kind='adamw', **kwargs):
+    if kind == 'adam':
+        optimizer_class = torch.optim.Adam
+    elif kind == 'adamw':
+        optimizer_class = torch.optim.AdamW
+    else:
+        raise ValueError(f'Unknown optimizer kind {kind}')
+    return optimizer_class(parameters, **kwargs)
+def update_running_average(result: nn.Module, new_iterate_model: nn.Module, decay=0.999):
+    with torch.no_grad():
+        res_params = dict(result.named_parameters())
+        new_params = dict(new_iterate_model.named_parameters())
+        for k in res_params.keys():
+            res_params[k].data.mul_(decay).add_(new_params[k].data, alpha=1 - decay)
+def make_multiscale_noise(base_tensor, scales=6, scale_mode='bilinear'):
+    batch_size, _, height, width = base_tensor.shape
+    cur_height, cur_width = height, width
+    result = []
+    align_corners = False if scale_mode in ('bilinear', 'bicubic') else None
+    for _ in range(scales):
+        cur_sample = torch.randn(batch_size, 1, cur_height, cur_width, device=base_tensor.device)
+        cur_sample_scaled = F.interpolate(cur_sample, size=(height, width), mode=scale_mode, align_corners=align_corners)
+        result.append(cur_sample_scaled)
+        cur_height //= 2
+        cur_width //= 2
+    return torch.cat(result, dim=1)
+class BaseInpaintingTrainingModule(ptl.LightningModule):
+    def __init__(self, config, use_ddp, *args,  predict_only=False, visualize_each_iters=100,
+                 average_generator=False, generator_avg_beta=0.999, average_generator_start_step=30000,
+                 average_generator_period=10, store_discr_outputs_for_vis=False,
+                 **kwargs):
+        """Build the generator from ``config`` and record basic settings.
+
+        Args:
+            config: Configuration object; ``config.generator`` is unpacked as
+                keyword arguments for ``make_generator``.
+            use_ddp: Stored on ``self.use_ddp``; read by ``train_dataloader``.
+            *args, **kwargs: Forwarded to the parent ``__init__``.
+            visualize_each_iters: Stored on ``self.visualize_each_iters``;
+                ``_do_step`` visualizes batches whose index is a multiple of it.
+            predict_only, average_generator, generator_avg_beta,
+            average_generator_start_step, average_generator_period,
+            store_discr_outputs_for_vis: Accepted but unused here, because the
+                training-only setup that consumed them is commented out below.
+        """
+        super().__init__(*args, **kwargs)
+        LOGGER.info('BaseInpaintingTrainingModule init called')
+        self.config = config
+        self.generator = make_generator(config, **self.config.generator)
+        self.use_ddp = use_ddp
+        # Print the generator architecture only when get_has_ddp_rank() is falsy.
+        if not get_has_ddp_rank():
+            LOGGER.info(f'Generator\n{self.generator}')
+        # NOTE(review): the training-only setup below is disabled, so attributes such as
+        # self.discriminator, self.adversarial_loss, self.visualizer, the evaluators,
+        # the generator-averaging state and the loss modules are never assigned here,
+        # although other methods of this class and its subclass read them.
+        # if not predict_only:
+        #     self.save_hyperparameters(self.config)
+        #     self.discriminator = make_discriminator(**self.config.discriminator)
+        #     self.adversarial_loss = make_discrim_loss(**self.config.losses.adversarial)
+        #     self.visualizer = make_visualizer(**self.config.visualizer)
+        #     self.val_evaluator = make_evaluator(**self.config.evaluator)
+        #     self.test_evaluator = make_evaluator(**self.config.evaluator)
+        #
+        #     if not get_has_ddp_rank():
+        #         LOGGER.info(f'Discriminator\n{self.discriminator}')
+        #
+        #     extra_val = self.config.data.get('extra_val', ())
+        #     if extra_val:
+        #         self.extra_val_titles = list(extra_val)
+        #         self.extra_evaluators = nn.ModuleDict({k: make_evaluator(**self.config.evaluator)
+        #                                                for k in extra_val})
+        #     else:
+        #         self.extra_evaluators = {}
+        #
+        #     self.average_generator = average_generator
+        #     self.generator_avg_beta = generator_avg_beta
+        #     self.average_generator_start_step = average_generator_start_step
+        #     self.average_generator_period = average_generator_period
+        #     self.generator_average = None
+        #     self.last_generator_averaging_step = -1
+        #     self.store_discr_outputs_for_vis = store_discr_outputs_for_vis
+        #
+        #     if self.config.losses.get("l1", {"weight_known": 0})['weight_known'] > 0:
+        #         self.loss_l1 = nn.L1Loss(reduction='none')
+        #
+        #     if self.config.losses.get("mse", {"weight": 0})['weight'] > 0:
+        #         self.loss_mse = nn.MSELoss(reduction='none')
+        #
+        #     if self.config.losses.perceptual.weight > 0:
+        #         self.loss_pl = PerceptualLoss()
+        #
+        #     # if self.config.losses.get("resnet_pl", {"weight": 0})['weight'] > 0:
+        #     #     self.loss_resnet_pl = ResNetPL(**self.config.losses.resnet_pl)
+        #     # else:
+        #     #     self.loss_resnet_pl = None
+        #
+        #     self.loss_resnet_pl = None
+        self.visualize_each_iters = visualize_each_iters
+        LOGGER.info('BaseInpaintingTrainingModule init done')
+    def configure_optimizers(self):
+        discriminator_params = list(self.discriminator.parameters())
+        return [
+            dict(optimizer=make_optimizer(self.generator.parameters(), **self.config.optimizers.generator)),
+            dict(optimizer=make_optimizer(discriminator_params, **self.config.optimizers.discriminator)),
+        ]
+    def train_dataloader(self):
+        kwargs = dict(self.config.data.train)
+        if self.use_ddp:
+            kwargs['ddp_kwargs'] = dict(num_replicas=self.trainer.num_nodes * self.trainer.num_processes,
+                                        rank=self.trainer.global_rank,
+                                        shuffle=True)
+        dataloader = make_default_train_dataloader(**self.config.data.train)
+        return dataloader
+    def val_dataloader(self):
+        res = [make_default_val_dataloader(**self.config.data.val)]
+        if self.config.data.visual_test is not None:
+            res = res + [make_default_val_dataloader(**self.config.data.visual_test)]
+        else:
+            res = res + res
+        extra_val = self.config.data.get('extra_val', ())
+        if extra_val:
+            res += [make_default_val_dataloader(**extra_val[k]) for k in self.extra_val_titles]
+        return res
+    def training_step(self, batch, batch_idx, optimizer_idx=None):
+        self._is_training_step = True
+        return self._do_step(batch, batch_idx, mode='train', optimizer_idx=optimizer_idx)
+    def validation_step(self, batch, batch_idx, dataloader_idx):
+        extra_val_key = None
+        if dataloader_idx == 0:
+            mode = 'val'
+        elif dataloader_idx == 1:
+            mode = 'test'
+        else:
+            mode = 'extra_val'
+            extra_val_key = self.extra_val_titles[dataloader_idx - 2]
+        self._is_training_step = False
+        return self._do_step(batch, batch_idx, mode=mode, extra_val_key=extra_val_key)
+    def training_step_end(self, batch_parts_outputs):
+        """Maintain the averaged generator, log step metrics and return the loss.
+
+        ``batch_parts_outputs`` is read as a mapping with a ``'loss'`` entry and a
+        ``'log_info'`` mapping, i.e. the structure returned by ``_do_step``.
+        """
+        # Refresh the averaged generator only while training, when averaging is enabled,
+        # once the start step is reached and the averaging period has elapsed.
+        if self.training and self.average_generator \
+                and self.global_step >= self.average_generator_start_step \
+                and self.global_step >= self.last_generator_averaging_step + self.average_generator_period:
+            if self.generator_average is None:
+                # First averaging step: start from a copy of the current generator.
+                self.generator_average = copy.deepcopy(self.generator)
+            else:
+                update_running_average(self.generator_average, self.generator, decay=self.generator_avg_beta)
+            self.last_generator_averaging_step = self.global_step
+        # Reduce a tensor loss to its mean; wrap a non-tensor loss in a float tensor
+        # that requires grad.
+        full_loss = (batch_parts_outputs['loss'].mean()
+                     if torch.is_tensor(batch_parts_outputs['loss'])  # loss is not tensor when no discriminator used
+                     else torch.tensor(batch_parts_outputs['loss']).float().requires_grad_(True))
+        log_info = {k: v.mean() for k, v in batch_parts_outputs['log_info'].items()}
+        self.log_dict(log_info, on_step=True, on_epoch=False)
+        return full_loss
+    def validation_epoch_end(self, outputs):
+        """Aggregate per-step evaluation outputs, then log and print the metrics.
+
+        ``outputs`` is iterated as groups of step results; each step result is a
+        mapping with ``'log_info'`` and, depending on the mode it was produced in,
+        one ``*_evaluator_state`` entry (see ``_do_step``).
+        """
+        # Flatten the groups into a single list of step results.
+        outputs = [step_out for out_group in outputs for step_out in out_group]
+        averaged_logs = average_dicts(step_out['log_info'] for step_out in outputs)
+        self.log_dict({k: v.mean() for k, v in averaged_logs.items()})
+        # Widen pandas' display limits so the metric tables logged below are not truncated.
+        pd.set_option('display.max_columns', 500)
+        pd.set_option('display.width', 1000)
+        # standard validation
+        val_evaluator_states = [s['val_evaluator_state'] for s in outputs if 'val_evaluator_state' in s]
+        val_evaluator_res = self.val_evaluator.evaluation_end(states=val_evaluator_states)
+        val_evaluator_res_df = pd.DataFrame(val_evaluator_res).stack(1).unstack(0)
+        val_evaluator_res_df.dropna(axis=1, how='all', inplace=True)
+        LOGGER.info(f'Validation metrics after epoch #{self.current_epoch}, '
+                    f'total {self.global_step} iterations:\n{val_evaluator_res_df}')
+        for k, v in flatten_dict(val_evaluator_res).items():
+            self.log(f'val_{k}', v)
+        # standard visual test
+        test_evaluator_states = [s['test_evaluator_state'] for s in outputs
+                                 if 'test_evaluator_state' in s]
+        test_evaluator_res = self.test_evaluator.evaluation_end(states=test_evaluator_states)
+        test_evaluator_res_df = pd.DataFrame(test_evaluator_res).stack(1).unstack(0)
+        test_evaluator_res_df.dropna(axis=1, how='all', inplace=True)
+        LOGGER.info(f'Test metrics after epoch #{self.current_epoch}, '
+                    f'total {self.global_step} iterations:\n{test_evaluator_res_df}')
+        for k, v in flatten_dict(test_evaluator_res).items():
+            self.log(f'test_{k}', v)
+        # extra validations
+        if self.extra_evaluators:
+            for cur_eval_title, cur_evaluator in self.extra_evaluators.items():
+                # Key under which _do_step stored this evaluator's per-batch state.
+                cur_state_key = f'extra_val_{cur_eval_title}_evaluator_state'
+                cur_states = [s[cur_state_key] for s in outputs if cur_state_key in s]
+                cur_evaluator_res = cur_evaluator.evaluation_end(states=cur_states)
+                cur_evaluator_res_df = pd.DataFrame(cur_evaluator_res).stack(1).unstack(0)
+                cur_evaluator_res_df.dropna(axis=1, how='all', inplace=True)
+                LOGGER.info(f'Extra val {cur_eval_title} metrics after epoch #{self.current_epoch}, '
+                            f'total {self.global_step} iterations:\n{cur_evaluator_res_df}')
+                for k, v in flatten_dict(cur_evaluator_res).items():
+                    self.log(f'extra_val_{cur_eval_title}_{k}', v)
+    def _do_step(self, batch, batch_idx, mode='train', optimizer_idx=None, extra_val_key=None):
+        """Run one forward pass, compute the relevant loss and collect outputs.
+
+        Args:
+            batch: Input passed through ``self(batch)``; the returned batch is used
+                for losses, visualization and evaluation.
+            batch_idx: Index of the batch; used to decide when to visualize.
+            mode: One of ``'train'``, ``'val'``, ``'test'`` or ``'extra_val'``.
+            optimizer_idx: ``0`` for the generator step, ``1`` for the discriminator
+                step, ``None`` when no optimizer index is given.
+            extra_val_key: Name of the extra validation set when ``mode`` is
+                ``'extra_val'``.
+
+        Returns:
+            ``dict`` with ``loss``, ``log_info`` (metrics prefixed with the mode) and,
+            outside training, the evaluator state for the batch.
+        """
+        if optimizer_idx == 0:  # step for generator
+            set_requires_grad(self.generator, True)
+            set_requires_grad(self.discriminator, False)
+        elif optimizer_idx == 1:  # step for discriminator
+            set_requires_grad(self.generator, False)
+            set_requires_grad(self.discriminator, True)
+        batch = self(batch)
+        total_loss = 0
+        metrics = {}
+        if optimizer_idx is None or optimizer_idx == 0:  # step for generator
+            total_loss, metrics = self.generator_loss(batch)
+        # NOTE(review): `optimizer_idx is None` can never be true here because the
+        # branch above already handles it; only `optimizer_idx == 1` reaches this arm.
+        elif optimizer_idx is None or optimizer_idx == 1:  # step for discriminator
+            if self.config.losses.adversarial.weight > 0:
+                total_loss, metrics = self.discriminator_loss(batch)
+        # Visualize only when get_ddp_rank() is None or 0, every
+        # `visualize_each_iters` batches, and for every batch in 'test' mode.
+        if self.get_ddp_rank() in (None, 0) and (batch_idx % self.visualize_each_iters == 0 or mode == 'test'):
+            if self.config.losses.adversarial.weight > 0:
+                if self.store_discr_outputs_for_vis:
+                    with torch.no_grad():
+                        self.store_discr_outputs(batch)
+            vis_suffix = f'_{mode}'
+            if mode == 'extra_val':
+                vis_suffix += f'_{extra_val_key}'
+            self.visualizer(self.current_epoch, batch_idx, batch, suffix=vis_suffix)
+        metrics_prefix = f'{mode}_'
+        if mode == 'extra_val':
+            metrics_prefix += f'{extra_val_key}_'
+        result = dict(loss=total_loss, log_info=add_prefix_to_keys(metrics, metrics_prefix))
+        # Attach the evaluator state under the key validation_epoch_end looks for.
+        if mode == 'val':
+            result['val_evaluator_state'] = self.val_evaluator.process_batch(batch)
+        elif mode == 'test':
+            result['test_evaluator_state'] = self.test_evaluator.process_batch(batch)
+        elif mode == 'extra_val':
+            result[f'extra_val_{extra_val_key}_evaluator_state'] = self.extra_evaluators[extra_val_key].process_batch(batch)
+        return result
+    def get_current_generator(self, no_average=False):
+        if not no_average and not self.training and self.average_generator and self.generator_average is not None:
+            return self.generator_average
+        return self.generator
+    def forward(self, batch: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+        """Pass data through generator and obtain at least 'predicted_image' and 'inpainted' keys.
+
+        Must be overridden by subclasses; this base implementation always raises
+        ``NotImplementedError``.
+        """
+        raise NotImplementedError()
+    def generator_loss(self, batch) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+        """Return ``(total_loss, metrics)`` for the generator step; subclasses must override."""
+        raise NotImplementedError()
+    def discriminator_loss(self, batch) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+        """Return ``(total_loss, metrics)`` for the discriminator step; subclasses must override."""
+        raise NotImplementedError()
+    def store_discr_outputs(self, batch):
+        out_size = batch['image'].shape[2:]
+        discr_real_out, _ = self.discriminator(batch['image'])
+        discr_fake_out, _ = self.discriminator(batch['predicted_image'])
+        batch['discr_output_real'] = F.interpolate(discr_real_out, size=out_size, mode='nearest')
+        batch['discr_output_fake'] = F.interpolate(discr_fake_out, size=out_size, mode='nearest')
+        batch['discr_output_diff'] = batch['discr_output_real'] - batch['discr_output_fake']
+    def get_ddp_rank(self):
+        return self.trainer.global_rank if (self.trainer.num_nodes * self.trainer.num_processes) > 1 else None

annotator/lama/saicinpainting/training/trainers/default.py ADDED Viewed

	@@ -0,0 +1,175 @@

+import logging
+import torch
+import torch.nn.functional as F
+from omegaconf import OmegaConf
+# from annotator.lama.saicinpainting.training.data.datasets import make_constant_area_crop_params
+from annotator.lama.saicinpainting.training.losses.distance_weighting import make_mask_distance_weighter
+from annotator.lama.saicinpainting.training.losses.feature_matching import feature_matching_loss, masked_l1_loss
+# from annotator.lama.saicinpainting.training.modules.fake_fakes import FakeFakesGenerator
+from annotator.lama.saicinpainting.training.trainers.base import BaseInpaintingTrainingModule, make_multiscale_noise
+from annotator.lama.saicinpainting.utils import add_prefix_to_keys, get_ramp
+# Module-level logger named after this module.
+LOGGER = logging.getLogger(__name__)
+def make_constant_area_crop_batch(batch, **kwargs):
+    crop_y, crop_x, crop_height, crop_width = make_constant_area_crop_params(img_height=batch['image'].shape[2],
+                                                                             img_width=batch['image'].shape[3],
+                                                                             **kwargs)
+    batch['image'] = batch['image'][:, :, crop_y : crop_y + crop_height, crop_x : crop_x + crop_width]
+    batch['mask'] = batch['mask'][:, :, crop_y: crop_y + crop_height, crop_x: crop_x + crop_width]
+    return batch
+class DefaultInpaintingTrainingModule(BaseInpaintingTrainingModule):
+    def __init__(self, *args, concat_mask=True, rescale_scheduler_kwargs=None, image_to_discriminator='predicted_image',
+                 add_noise_kwargs=None, noise_fill_hole=False, const_area_crop_kwargs=None,
+                 distance_weighter_kwargs=None, distance_weighted_mask_for_discr=False,
+                 fake_fakes_proba=0, fake_fakes_generator_kwargs=None,
+                 **kwargs):
+        super().__init__(*args, **kwargs)
+        self.concat_mask = concat_mask
+        self.rescale_size_getter = get_ramp(**rescale_scheduler_kwargs) if rescale_scheduler_kwargs is not None else None
+        self.image_to_discriminator = image_to_discriminator
+        self.add_noise_kwargs = add_noise_kwargs
+        self.noise_fill_hole = noise_fill_hole
+        self.const_area_crop_kwargs = const_area_crop_kwargs
+        self.refine_mask_for_losses = make_mask_distance_weighter(**distance_weighter_kwargs) \
+            if distance_weighter_kwargs is not None else None
+        self.distance_weighted_mask_for_discr = distance_weighted_mask_for_discr
+        self.fake_fakes_proba = fake_fakes_proba
+        if self.fake_fakes_proba > 1e-3:
+            self.fake_fakes_gen = FakeFakesGenerator(**(fake_fakes_generator_kwargs or {}))
+    def forward(self, batch):
+        """Run the generator on the masked image and extend ``batch`` with the results.
+
+        Reads ``batch['image']`` and ``batch['mask']``. Adds ``'predicted_image'``,
+        ``'inpainted'`` and ``'mask_for_losses'``; when ``fake_fakes_proba > 1e-3``
+        also ``'fake_fakes'``, ``'fake_fakes_masks'`` and ``'use_fake_fakes'``.
+        During training the image and mask may first be resized and cropped,
+        replacing the entries in ``batch``.
+        """
+        # Training-time rescaling: the target size depends on the global step.
+        if self.training and self.rescale_size_getter is not None:
+            cur_size = self.rescale_size_getter(self.global_step)
+            batch['image'] = F.interpolate(batch['image'], size=cur_size, mode='bilinear', align_corners=False)
+            batch['mask'] = F.interpolate(batch['mask'], size=cur_size, mode='nearest')
+        if self.training and self.const_area_crop_kwargs is not None:
+            batch = make_constant_area_crop_batch(batch, **self.const_area_crop_kwargs)
+        img = batch['image']
+        mask = batch['mask']
+        # Zero the image wherever the mask is 1 (assumes mask values lie in [0, 1] — TODO confirm).
+        masked_img = img * (1 - mask)
+        if self.add_noise_kwargs is not None:
+            noise = make_multiscale_noise(masked_img, **self.add_noise_kwargs)
+            if self.noise_fill_hole:
+                # Fill the masked area with the first noise channels, one per image channel.
+                masked_img = masked_img + mask * noise[:, :masked_img.shape[1]]
+            masked_img = torch.cat([masked_img, noise], dim=1)
+        if self.concat_mask:
+            # Give the generator the mask as an extra input channel.
+            masked_img = torch.cat([masked_img, mask], dim=1)
+        batch['predicted_image'] = self.generator(masked_img)
+        # Keep original pixels where the mask is 0 and the prediction where it is 1.
+        batch['inpainted'] = mask * batch['predicted_image'] + (1 - mask) * batch['image']
+        if self.fake_fakes_proba > 1e-3:
+            if self.training and torch.rand(1).item() < self.fake_fakes_proba:
+                batch['fake_fakes'], batch['fake_fakes_masks'] = self.fake_fakes_gen(img, mask)
+                batch['use_fake_fakes'] = True
+            else:
+                # Zero placeholders are stored, so the keys exist whenever the feature is enabled.
+                batch['fake_fakes'] = torch.zeros_like(img)
+                batch['fake_fakes_masks'] = torch.zeros_like(mask)
+                batch['use_fake_fakes'] = False
+        # The refined (distance-weighted) mask is used only in training; otherwise the plain mask.
+        batch['mask_for_losses'] = self.refine_mask_for_losses(img, batch['predicted_image'], mask) \
+            if self.refine_mask_for_losses is not None and self.training \
+            else mask
+        return batch
+    def generator_loss(self, batch):
+        """Compute the generator loss and its individual terms.
+
+        Sums the masked L1 term, the adversarial term and, when enabled, the
+        perceptual, feature-matching and ResNet perceptual terms.
+
+        Returns:
+            ``(total_loss, metrics)`` where ``metrics`` maps term names
+            (``gen_l1``, ``gen_pl``, ``gen_adv``, ``gen_fm``, ``gen_resnet_pl`` and
+            ``adv_``-prefixed adversarial metrics) to their values.
+        """
+        # NOTE(review): self.loss_pl, self.adversarial_loss, self.discriminator and
+        # self.loss_resnet_pl are read here but their assignments in the base-class
+        # __init__ are commented out — confirm they are set elsewhere before use.
+        img = batch['image']
+        predicted_img = batch[self.image_to_discriminator]
+        original_mask = batch['mask']
+        supervised_mask = batch['mask_for_losses']
+        # L1
+        l1_value = masked_l1_loss(predicted_img, img, supervised_mask,
+                                  self.config.losses.l1.weight_known,
+                                  self.config.losses.l1.weight_missing)
+        total_loss = l1_value
+        metrics = dict(gen_l1=l1_value)
+        # vgg-based perceptual loss
+        if self.config.losses.perceptual.weight > 0:
+            pl_value = self.loss_pl(predicted_img, img, mask=supervised_mask).sum() * self.config.losses.perceptual.weight
+            total_loss = total_loss + pl_value
+            metrics['gen_pl'] = pl_value
+        # discriminator
+        # adversarial_loss calls backward by itself
+        mask_for_discr = supervised_mask if self.distance_weighted_mask_for_discr else original_mask
+        self.adversarial_loss.pre_generator_step(real_batch=img, fake_batch=predicted_img,
+                                                 generator=self.generator, discriminator=self.discriminator)
+        discr_real_pred, discr_real_features = self.discriminator(img)
+        discr_fake_pred, discr_fake_features = self.discriminator(predicted_img)
+        adv_gen_loss, adv_metrics = self.adversarial_loss.generator_loss(real_batch=img,
+                                                                         fake_batch=predicted_img,
+                                                                         discr_real_pred=discr_real_pred,
+                                                                         discr_fake_pred=discr_fake_pred,
+                                                                         mask=mask_for_discr)
+        total_loss = total_loss + adv_gen_loss
+        metrics['gen_adv'] = adv_gen_loss
+        metrics.update(add_prefix_to_keys(adv_metrics, 'adv_'))
+        # feature matching
+        if self.config.losses.feature_matching.weight > 0:
+            # The mask is passed to the feature-matching loss only when the config sets `pass_mask`.
+            need_mask_in_fm = OmegaConf.to_container(self.config.losses.feature_matching).get('pass_mask', False)
+            mask_for_fm = supervised_mask if need_mask_in_fm else None
+            fm_value = feature_matching_loss(discr_fake_features, discr_real_features,
+                                             mask=mask_for_fm) * self.config.losses.feature_matching.weight
+            total_loss = total_loss + fm_value
+            metrics['gen_fm'] = fm_value
+        if self.loss_resnet_pl is not None:
+            resnet_pl_value = self.loss_resnet_pl(predicted_img, img)
+            total_loss = total_loss + resnet_pl_value
+            metrics['gen_resnet_pl'] = resnet_pl_value
+        return total_loss, metrics
+    def discriminator_loss(self, batch):
+        """Compute the discriminator loss on real images versus generated ones.
+
+        When ``batch['use_fake_fakes']`` is truthy, a second adversarial term is
+        added that uses ``batch['fake_fakes']`` as the fake batch.
+
+        Returns:
+            ``(total_loss, metrics)`` with ``discr_adv``, optionally
+            ``discr_adv_fake_fakes``, and ``adv_``-prefixed adversarial metrics.
+        """
+        total_loss = 0
+        metrics = {}
+        # Detach so that this loss does not backpropagate into the generator.
+        predicted_img = batch[self.image_to_discriminator].detach()
+        self.adversarial_loss.pre_discriminator_step(real_batch=batch['image'], fake_batch=predicted_img,
+                                                     generator=self.generator, discriminator=self.discriminator)
+        discr_real_pred, discr_real_features = self.discriminator(batch['image'])
+        discr_fake_pred, discr_fake_features = self.discriminator(predicted_img)
+        adv_discr_loss, adv_metrics = self.adversarial_loss.discriminator_loss(real_batch=batch['image'],
+                                                                               fake_batch=predicted_img,
+                                                                               discr_real_pred=discr_real_pred,
+                                                                               discr_fake_pred=discr_fake_pred,
+                                                                               mask=batch['mask'])
+        total_loss = total_loss + adv_discr_loss
+        metrics['discr_adv'] = adv_discr_loss
+        metrics.update(add_prefix_to_keys(adv_metrics, 'adv_'))
+        if batch.get('use_fake_fakes', False):
+            fake_fakes = batch['fake_fakes']
+            self.adversarial_loss.pre_discriminator_step(real_batch=batch['image'], fake_batch=fake_fakes,
+                                                         generator=self.generator, discriminator=self.discriminator)
+            discr_fake_fakes_pred, _ = self.discriminator(fake_fakes)
+            # The real-image prediction computed above is reused for this second term.
+            fake_fakes_adv_discr_loss, fake_fakes_adv_metrics = self.adversarial_loss.discriminator_loss(
+                real_batch=batch['image'],
+                fake_batch=fake_fakes,
+                discr_real_pred=discr_real_pred,
+                discr_fake_pred=discr_fake_fakes_pred,
+                mask=batch['mask']
+            )
+            total_loss = total_loss + fake_fakes_adv_discr_loss
+            metrics['discr_adv_fake_fakes'] = fake_fakes_adv_discr_loss
+            # NOTE(review): same 'adv_' prefix as above, so these entries overwrite
+            # same-named metrics from the first adversarial term.
+            metrics.update(add_prefix_to_keys(fake_fakes_adv_metrics, 'adv_'))
+        return total_loss, metrics

annotator/lama/saicinpainting/training/visualizers/__init__.py ADDED Viewed

	@@ -0,0 +1,15 @@

+import logging
+from annotator.lama.saicinpainting.training.visualizers.directory import DirectoryVisualizer
+from annotator.lama.saicinpainting.training.visualizers.noop import NoopVisualizer
+def make_visualizer(kind, **kwargs):
+    logging.info(f'Make visualizer {kind}')
+    if kind == 'directory':
+        return DirectoryVisualizer(**kwargs)
+    if kind == 'noop':
+        return NoopVisualizer()
+    raise ValueError(f'Unknown visualizer kind {kind}')

annotator/lama/saicinpainting/training/visualizers/base.py ADDED Viewed

	@@ -0,0 +1,73 @@

+import abc
+from typing import Dict, List
+import numpy as np
+import torch
+from skimage import color
+from skimage.segmentation import mark_boundaries
+from . import colors
+# Colours handed to ``color.label2rgb`` when an image with more than three channels is rendered.
+COLORS, _ = colors.generate_colors(151) # 151 - max classes for semantic segmentation
+class BaseVisualizer:
+    """Interface for objects that turn a batch into a visualization.
+
+    NOTE(review): the class does not derive from ``abc.ABC``, so the
+    ``@abc.abstractmethod`` marker is not enforced at instantiation; calling
+    the base implementation raises ``NotImplementedError``.
+    """
+    @abc.abstractmethod
+    def __call__(self, epoch_i, batch_i, batch, suffix='', rank=None):
+        """
+        Take a batch, make an image from it and visualize
+        """
+        raise NotImplementedError()
+def visualize_mask_and_images(images_dict: Dict[str, np.ndarray], keys: List[str],
+                              last_without_mask=True, rescale_keys=None, mask_only_first=None,
+                              black_mask=False) -> np.ndarray:
+    """Render the images named by ``keys`` side by side, outlining the mask on them.
+
+    Args:
+        images_dict: Arrays for one sample; must contain ``'mask'`` and every key
+            in ``keys``. Each image is transposed with axes ``(1, 2, 0)``.
+        keys: Names of the images to render, in output order.
+        last_without_mask: When true, the last image gets no mask outline
+            (ignored when ``mask_only_first`` is truthy).
+        rescale_keys: Names of images to min-max rescale before rendering.
+        mask_only_first: When truthy, only the first image gets the mask outline.
+        black_mask: When true, pixels under the mask are zeroed before outlining.
+
+    Returns:
+        The rendered images concatenated along axis 1.
+    """
+    mask = images_dict['mask'] > 0.5
+    result = []
+    for i, k in enumerate(keys):
+        img = images_dict[k]
+        img = np.transpose(img, (1, 2, 0))
+        if rescale_keys is not None and k in rescale_keys:
+            # Min-max normalise; the small epsilon avoids division by zero.
+            img = img - img.min()
+            img /= img.max() + 1e-5
+        # NOTE(review): the three-axis transpose above already requires a 3-D array,
+        # so this 2-D branch looks unreachable — confirm.
+        if len(img.shape) == 2:
+            img = np.expand_dims(img, 2)
+        if img.shape[2] == 1:
+            # Single channel: replicate to three channels.
+            img = np.repeat(img, 3, axis=2)
+        elif (img.shape[2] > 3):
+            # More than three channels: take the arg-max channel per pixel and colour it.
+            img_classes = img.argmax(2)
+            img = color.label2rgb(img_classes, colors=COLORS)
+        if mask_only_first:
+            need_mark_boundaries = i == 0
+        else:
+            need_mark_boundaries = i < len(keys) - 1 or not last_without_mask
+        if need_mark_boundaries:
+            if black_mask:
+                img = img * (1 - mask[0][..., None])
+            img = mark_boundaries(img,
+                                  mask[0],
+                                  color=(1., 0., 0.),
+                                  outline_color=(1., 1., 1.),
+                                  mode='thick')
+        result.append(img)
+    return np.concatenate(result, axis=1)
+def visualize_mask_and_images_batch(batch: Dict[str, torch.Tensor], keys: List[str], max_items=10,
+                                    last_without_mask=True, rescale_keys=None) -> np.ndarray:
+    batch = {k: tens.detach().cpu().numpy() for k, tens in batch.items()
+             if k in keys or k == 'mask'}
+    batch_size = next(iter(batch.values())).shape[0]
+    items_to_vis = min(batch_size, max_items)
+    result = []
+    for i in range(items_to_vis):
+        cur_dct = {k: tens[i] for k, tens in batch.items()}
+        result.append(visualize_mask_and_images(cur_dct, keys, last_without_mask=last_without_mask,
+                                                rescale_keys=rescale_keys))
+    return np.concatenate(result, axis=0)