vera6 committed on
Commit 463545c · verified · 1 Parent(s): d3b2701

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -1,35 +1,2 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
+ app/1510dnpnr-15K.ckpt filter=lfs diff=lfs merge=lfs -text
+ app/*.ckpt filter=lfs diff=lfs merge=lfs -text
.gitignore CHANGED
@@ -1,162 +1,162 @@
  # Byte-compiled / optimized / DLL files
  __pycache__/
  *.py[cod]
  *$py.class

  # C extensions
  *.so

  # Distribution / packaging
  .Python
  build/
  develop-eggs/
  dist/
  downloads/
  eggs/
  .eggs/
  lib/
  lib64/
  parts/
  sdist/
  var/
  wheels/
  share/python-wheels/
  *.egg-info/
  .installed.cfg
  *.egg
  MANIFEST

  # PyInstaller
  # Usually these files are written by a python script from a template
  # before PyInstaller builds the exe, so as to inject date/other infos into it.
  *.manifest
  *.spec

  # Installer logs
  pip-log.txt
  pip-delete-this-directory.txt

  # Unit test / coverage reports
  htmlcov/
  .tox/
  .nox/
  .coverage
  .coverage.*
  .cache
  nosetests.xml
  coverage.xml
  *.cover
  *.py,cover
  .hypothesis/
  .pytest_cache/
  cover/

  # Translations
  *.mo
  *.pot

  # Django stuff:
  *.log
  local_settings.py
  db.sqlite3
  db.sqlite3-journal

  # Flask stuff:
  instance/
  .webassets-cache

  # Scrapy stuff:
  .scrapy

  # Sphinx documentation
  docs/_build/

  # PyBuilder
  .pybuilder/
  target/

  # Jupyter Notebook
  .ipynb_checkpoints

  # IPython
  profile_default/
  ipython_config.py

  # pyenv
  # For a library or package, you might want to ignore these files since the code is
  # intended to run in multiple environments; otherwise, check them in:
  # .python-version

  # pipenv
  # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
  # However, in case of collaboration, if having platform-specific dependencies or dependencies
  # having no cross-platform support, pipenv may install dependencies that don't work, or not
  # install all needed dependencies.
  #Pipfile.lock

  # poetry
  # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
  # This is especially recommended for binary packages to ensure reproducibility, and is more
  # commonly ignored for libraries.
  # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
  #poetry.lock

  # pdm
  # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
  #pdm.lock
  # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
  # in version control.
  # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
  .pdm.toml
  .pdm-python
  .pdm-build/

  # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
  __pypackages__/

  # Celery stuff
  celerybeat-schedule
  celerybeat.pid

  # SageMath parsed files
  *.sage.py

  # Environments
  .env
  .venv
  env/
  venv/
  ENV/
  env.bak/
  venv.bak/

  # Spyder project settings
  .spyderproject
  .spyproject

  # Rope project settings
  .ropeproject

  # mkdocs documentation
  /site

  # mypy
  .mypy_cache/
  .dmypy.json
  dmypy.json

  # Pyre type checker
  .pyre/

  # pytype static type analyzer
  .pytype/

  # Cython debug symbols
  cython_debug/

  # PyCharm
  # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
  # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
  # and can be added to the global gitignore or merged into this file. For a more nuclear
  # option (not recommended) you can uncomment the following to ignore the entire idea folder.
  #.idea/
Dockerfile CHANGED
@@ -1,33 +1,33 @@
- FROM python:3.10.14-bookworm
-
- ARG USER_UID=10002
- ARG USER_GID=$USER_UID
- ARG USERNAME=modelapi
-
- RUN groupadd --gid $USER_GID $USERNAME \
-     && useradd --uid $USER_UID --gid $USER_GID -m $USERNAME
-
- # Copy required files
- RUN mkdir -p /modelapi && mkdir -p /home/$USERNAME/.modelapi
- COPY app /modelapi/app
- COPY sgmse /modelapi/sgmse
- COPY pyproject.toml /modelapi/pyproject.toml
-
- ENV CUDA_HOME=/usr/local/cuda-12.6
-
- # Setup permissions
- RUN chown -R $USER_UID:$USER_GID /modelapi \
-     && chown -R $USER_UID:$USER_GID /home/$USERNAME/.modelapi \
-     && chown -R $USER_UID:$USER_GID /home/$USERNAME \
-     && chmod -R 755 /home/$USERNAME \
-     && chmod -R 755 /modelapi \
-     && chmod -R 755 /home/$USERNAME/.modelapi
-
- # Change to the user and do subnet installation
- USER $USERNAME
-
- RUN /bin/bash -c "python3 -m venv /modelapi/.venv && source /modelapi/.venv/bin/activate && pip3 install -e /modelapi/."
-
- EXPOSE 6500
-
  CMD ["/bin/bash", "-c", "source /modelapi/.venv/bin/activate && python3 /modelapi/app/run.py"]
+ FROM python:3.10.14-bookworm
+
+ ARG USER_UID=10002
+ ARG USER_GID=$USER_UID
+ ARG USERNAME=modelapi
+
+ RUN groupadd --gid $USER_GID $USERNAME \
+     && useradd --uid $USER_UID --gid $USER_GID -m $USERNAME
+
+ # Copy required files
+ RUN mkdir -p /modelapi && mkdir -p /home/$USERNAME/.modelapi
+ COPY app /modelapi/app
+ COPY sgmse /modelapi/sgmse
+ COPY pyproject.toml /modelapi/pyproject.toml
+
+ ENV CUDA_HOME=/usr/local/cuda-12.6
+
+ # Setup permissions
+ RUN chown -R $USER_UID:$USER_GID /modelapi \
+     && chown -R $USER_UID:$USER_GID /home/$USERNAME/.modelapi \
+     && chown -R $USER_UID:$USER_GID /home/$USERNAME \
+     && chmod -R 755 /home/$USERNAME \
+     && chmod -R 755 /modelapi \
+     && chmod -R 755 /home/$USERNAME/.modelapi
+
+ # Change to the user
+ USER $USERNAME
+
+ RUN /bin/bash -c "python3 -m venv /modelapi/.venv && source /modelapi/.venv/bin/activate && pip3 install -e /modelapi/."
+
+ EXPOSE 6500
+
  CMD ["/bin/bash", "-c", "source /modelapi/.venv/bin/activate && python3 /modelapi/app/run.py"]
LICENSE CHANGED
@@ -1,34 +1,21 @@
- MIT License
-
- Copyright (c) 2024 synapsec.ai
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in all
- copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
-
- ---
-
- ### Third-Party Code:
-
- Portions of this software are derived from code in the following project(s):
-
- - speech-enhancement-sgmse by sp-uhh (MIT License)
-     - Repository: https://huggingface.co/sp-uhh/speech-enhancement-sgmse
-     - Copyright (c) 2022 Signal Processing (SP), Universität Hamburg
-     - Licensed under the MIT License (included in the `THIRD_PARTY_LICENSES` file)
-
- ---
+ MIT License
+
+ Copyright (c) 2024 synapsec.ai
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1 +1,30 @@
- DENOISING speech enhancement model
+ # Container Template for SoundsRight Subnet Miners
+
+ Miners in [Bittensor's](https://bittensor.com/) [SoundsRight Subnet](https://github.com/synapsec-ai/soundsright-subnet) must containerize their models before uploading them to HuggingFace. This repo serves as a template.
+
+ The branches `DENOISING_16000HZ` and `DEREVERBERATION_16000HZ` contain this template fitted with [SGMSE+](https://huggingface.co/sp-uhh/speech-enhancement-sgmse) for 16 kHz speech enhancement, and the branches `DENOISING_48000HZ` and `DEREVERBERATION_48000HZ` are fitted with SGMSE+ for 48 kHz speech enhancement, covering the denoising and dereverberation tasks. These branches are also helpful references for incorporating your own model.
+
+ The `main` branch contains a template for a container that spins up an API to communicate with the validator. The following entrypoints cannot be altered:
+
+ 1. `/status/` : Communicates API status
+ 2. `/prepare/` : Makes necessary preparations (downloading checkpoints, etc.) and initializes the model
+ 3. `/upload-audio/` : Uploads audio files and saves them to the noisy audio directory
+ 4. `/enhance/` : Initializes the model, enhances the audio files, and saves them to the enhanced audio directory
+ 5. `/download-enhanced/` : Downloads the enhanced audio files
+ 6. `/reset/` : Removes all existing audio files in preparation for another batch of enhancement
+
+ To add your own model to this template, a miner must do a few things:
+
+ 1. Add the model files under the `model` directory.
+ 2. Modify the `modelapi.prepare` method in `app/app.py` with the preparations necessary to initialize your model.
+ 3. Modify the `modelapi.enhance` method in `app/app.py` with the logic your model uses to enhance audio.
+ 4. Update `dependencies` in `pyproject.toml` with the dependencies used by your model.
+ 5. If you have directories other than `app` in your repository, be sure to modify the `Dockerfile` accordingly (see line 12 of the `Dockerfile` for how to do this).
+ 6. Cite your sources (if applicable).
+
+ For your model to be processed by validators, there are a few formatting requirements. Note that the template has already been formatted to fit these guidelines.
+
+ 1. API endpoints must be as outlined above.
+ 2. The port must be 6500.
+ 3. The container must be configured to run as a non-root user.
+ 4. The container must not rely on network access to function as intended.
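A minimal sketch of how a validator-style client might exercise these endpoints once the container is running, assuming it is reachable at `http://localhost:6500` (the endpoint paths and the `files` form field match `app/app.py` in this commit; the file names and the `requests` client are illustrative assumptions, not part of the repo):

```python
import requests

BASE = "http://localhost:6500"  # assumption: container published on this host/port

# 1. Check that the API is up, then initialize the model
print(requests.get(f"{BASE}/status/").json())    # expect {"container_running": True}
print(requests.post(f"{BASE}/prepare/").json())  # expect {"preparations": True}

# 2. Upload one or more noisy .wav files (the multipart field name is "files")
with open("noisy.wav", "rb") as fh:  # hypothetical input file
    resp = requests.post(
        f"{BASE}/upload-audio/",
        files=[("files", ("noisy.wav", fh, "audio/wav"))],
    )
print(resp.json())

# 3. Run enhancement, then download the results as a zip archive
requests.post(f"{BASE}/enhance/")
with open("enhanced_audio_files.zip", "wb") as out:
    out.write(requests.get(f"{BASE}/download-enhanced/").content)

# 4. Clear both audio directories before the next batch
requests.post(f"{BASE}/reset/")
```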
app/app.py CHANGED
@@ -1,213 +1,205 @@
- import fastapi
- import shutil
- import os
- import zipfile
- import io
- import uvicorn
- import threading
  import glob
- from typing import List
  import torch
  import gdown
  from soundfile import write
  from torchaudio import load
  from librosa import resample
  import logging
-
  logging.basicConfig(level=logging.DEBUG)

  from sgmse import ScoreModel
  from sgmse.util.other import pad_spec

-
  class ModelAPI:
-
      def __init__(self, host, port):
-
-         self.host = host
          self.port = port
-
          self.base_path = os.path.join(os.path.expanduser("~"), ".modelapi")
          self.noisy_audio_path = os.path.join(self.base_path, "noisy_audio")
          self.enhanced_audio_path = os.path.join(self.base_path, "enhanced_audio")
          app_dir = os.path.dirname(os.path.abspath(__file__))
-         self.ckpt_path = glob.glob(os.path.join(app_dir, "*.ckpt"))[0]
-         self.device = "cuda" if torch.cuda.is_available() else "cpu"
          self.corrector = "ald"
          self.corrector_steps = 1
          self.snr = 0.5
          self.N = 30

          for audio_path in [self.noisy_audio_path, self.enhanced_audio_path]:
              if not os.path.exists(audio_path):
                  os.makedirs(audio_path)
-
              for filename in os.listdir(audio_path):
                  file_path = os.path.join(audio_path, filename)
-
                  try:
                      if os.path.isfile(file_path) or os.path.islink(file_path):
-                         os.unlink(file_path)
                      elif os.path.isdir(file_path):
-                         shutil.rmtree(file_path)
                  except Exception as e:
                      raise e
-
          self.app = fastapi.FastAPI()
          self._setup_routes()
-
      def _prepare(self):
          """Miners should modify this function to fit their fine-tuned models.
-
          This function will make any preparations necessary to initialize the
          speech enhancement model (i.e. downloading checkpoint files, etc.)
          """
-
          self.model = ScoreModel.load_from_checkpoint(self.ckpt_path, self.device)
          self.model.t_eps = 0.03
          self.model.eval()
-
      def _enhance(self):
          """
          Miners should modify this function to fit their fine-tuned models.
-
          This function will:
          1. Open each noisy .wav file
-         2. Enhance the audio with the model
-         3. Save the enhanced audio in .wav format to ModelAPI.enhanced_audio_path
          """
-
-         if self.model.backbone == "ncsnpp_48k":
              target_sr = 48000
              pad_mode = "reflection"
-         elif self.model.backbone == "ncsnpp_v2":
              target_sr = 16000
              pad_mode = "reflection"
-             print("using ncsnpp_v2")
          else:
              target_sr = 16000
              pad_mode = "zero_pad"
-
-         noisy_files = sorted(glob.glob(os.path.join(self.noisy_audio_path, "*.wav")))
-         for noisy_file in noisy_files:
-
              filename = noisy_file.replace(self.noisy_audio_path, "")
              filename = filename[1:] if filename.startswith("/") else filename

              y, sr = load(noisy_file)
-
              if sr != target_sr:
                  y = torch.tensor(resample(y.numpy(), orig_sr=sr, target_sr=target_sr))

-             T_orig = y.size(1)
-
              # Normalize
              norm_factor = y.abs().max()
              y = y / norm_factor
-
              # Prepare DNN input
-             Y = torch.unsqueeze(
-                 self.model._forward_transform(self.model._stft(y.to(self.device))), 0
-             )
              Y = pad_spec(Y, mode=pad_mode)
-
              # Reverse sampling
-             if self.model.sde.__class__.__name__ == "OUVESDE":
-                 if self.model.sde.sampler_type == "pc":
-                     sampler = self.model.get_pc_sampler(
-                         "reverse_diffusion",
-                         self.corrector,
-                         Y.to(self.device),
-                         N=self.N,
-                         corrector_steps=self.corrector_steps,
-                         snr=self.snr,
-                     )
-                 elif self.model.sde.sampler_type == "ode":
                      sampler = self.model.get_ode_sampler(Y.to(self.device), N=self.N)
                  else:
-                     raise ValueError(f"Sampler type {args.sampler_type} not supported")
-             elif self.model.sde.__class__.__name__ == "SBVESDE":
-                 sampler_type = (
-                     "ode"
-                     if self.model.sde.sampler_type == "pc"
-                     else self.model.sde.sampler_type
-                 )
-                 sampler = self.model.get_sb_sampler(
-                     sde=self.model.sde, y=Y.cuda(), sampler_type=sampler_type
-                 )
              else:
-                 raise ValueError(
-                     f"SDE {self.model.sde.__class__.__name__} not supported"
-                 )
-
              sample, _ = sampler()
-
              x_hat = self.model.to_audio(sample.squeeze(), T_orig)
-
              x_hat = x_hat * norm_factor
-
-             os.makedirs(
-                 os.path.dirname(os.path.join(self.enhanced_audio_path, filename)),
-                 exist_ok=True,
-             )
-             write(
-                 os.path.join(self.enhanced_audio_path, filename),
-                 x_hat.cpu().numpy(),
-                 target_sr,
-             )
-
      def _setup_routes(self):
          self.app.get("/status/")(self.get_status)
          self.app.post("/prepare/")(self.prepare)
          self.app.post("/upload-audio/")(self.upload_audio)
          self.app.post("/enhance/")(self.enhance_audio)
          self.app.get("/download-enhanced/")(self.download_enhanced)
          self.app.post("/reset/")(self.reset)
-
      def get_status(self):
          try:
              return {"container_running": True}
          except Exception as e:
              logging.error(f"Error getting status: {e}")
-             raise fastapi.HTTPException(
-                 status_code=500, detail="An error occurred while fetching API status."
-             )
-
      def prepare(self):
          try:
              self._prepare()
-             return {"preparations": True}
          except Exception as e:
              logging.error(f"Error during preparations: {e}")
-             return fastapi.HTTPException(
-                 status_code=500, detail="An error occurred while fetching API status."
-             )
-
      def upload_audio(self, files: List[fastapi.UploadFile] = fastapi.File(...)):
-
          uploaded_files = []
-
          for file in files:
-             try:
                  file_path = os.path.join(self.noisy_audio_path, file.filename)
-
                  with open(file_path, "wb") as f:
-                     while contents := file.file.read(1024 * 1024):
                          f.write(contents)
-
-                 uploaded_files.append(file.filename)
-
              except Exception as e:
-                 logging.error(f"Error uploading files: {e}")
-                 raise fastapi.HTTPException(
-                     status_code=500,
-                     detail="An error occurred while uploading the noisy files.",
-                 )
              finally:
                  file.file.close()
-
          print(f"uploaded files: {uploaded_files}")
-
          return {"uploaded_files": uploaded_files, "status": True}

      def enhance_audio(self):
@@ -215,44 +207,39 @@ class ModelAPI:
              # Enhance audio
              self._enhance()
              # Obtain list of file paths for enhanced audio
-             wav_files = glob.glob(os.path.join(self.enhanced_audio_path, "*.wav"))
              # Extract just the file names
              enhanced_files = [os.path.basename(file) for file in wav_files]
              return {"status": True}
-
          except Exception as e:
              print(f"Exception occurred during enhancement: {e}")
-             raise fastapi.HTTPException(
-                 status_code=500,
-                 detail="An error occurred while enhancing the noisy files.",
-             )
-
      def download_enhanced(self):
          try:
              zip_buffer = io.BytesIO()

              with zipfile.ZipFile(zip_buffer, "w") as zip_file:
-                 for wav_file in glob.glob(
-                     os.path.join(self.enhanced_audio_path, "*.wav")
-                 ):
                      zip_file.write(wav_file, arcname=os.path.basename(wav_file))

              zip_buffer.seek(0)

              return fastapi.responses.StreamingResponse(
                  iter([zip_buffer.getvalue()]),  # Stream the in-memory content
                  media_type="application/zip",
-                 headers={
-                     "Content-Disposition": "attachment; filename=enhanced_audio_files.zip"
-                 },
              )

          except Exception as e:
              logging.error(f"Error during enhanced files download: {e}")
-             raise fastapi.HTTPException(
-                 status_code=500,
-                 detail=f"An error occurred while creating the download file: {str(e)}",
-             )
-
      def reset(self):
          """
          Removes all audio files in preparation for another batch of enhancement.
@@ -268,17 +255,9 @@ class ModelAPI:
                  os.remove(filepath)
              except Exception as e:
                  print(f"Error removing {filepath}: {e}")
-                 return {
-                     "status": False,
-                     "noisy": os.listdir(self.noisy_audio_path),
-                     "enhanced": os.listdir(self.enhanced_audio_path),
-                 }
-         return {
-             "status": True,
-             "noisy": os.listdir(self.noisy_audio_path),
-             "enhanced": os.listdir(self.enhanced_audio_path),
-         }

      def run(self):
-
-         uvicorn.run(self.app, host=self.host, port=self.port)
+ import fastapi
+ import shutil
+ import os
+ import zipfile
+ import io
+ import uvicorn
  import glob
+ from typing import List
  import torch
  import gdown
  from soundfile import write
  from torchaudio import load
  from librosa import resample
  import logging
  logging.basicConfig(level=logging.DEBUG)

  from sgmse import ScoreModel
  from sgmse.util.other import pad_spec

  class ModelAPI:
+
      def __init__(self, host, port):
+
+         self.host = host
          self.port = port
+
          self.base_path = os.path.join(os.path.expanduser("~"), ".modelapi")
          self.noisy_audio_path = os.path.join(self.base_path, "noisy_audio")
          self.enhanced_audio_path = os.path.join(self.base_path, "enhanced_audio")
          app_dir = os.path.dirname(os.path.abspath(__file__))
+
+         ckpt_files = glob.glob(os.path.join(app_dir, "*.ckpt"))
+
+         if not ckpt_files:
+             raise FileNotFoundError("No .ckpt file found in app_dir.")
+         elif len(ckpt_files) > 1:
+             raise RuntimeError("Multiple .ckpt files found in app_dir. Please keep only one.")
+         else:
+             self.ckpt_path = ckpt_files[0]
+
+         self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
          self.corrector = "ald"
          self.corrector_steps = 1
          self.snr = 0.5
          self.N = 30

+         # Create directories if they do not exist
          for audio_path in [self.noisy_audio_path, self.enhanced_audio_path]:
              if not os.path.exists(audio_path):
                  os.makedirs(audio_path)
+
+             # Loop through all the files and subdirectories in the directory
              for filename in os.listdir(audio_path):
                  file_path = os.path.join(audio_path, filename)
+
+                 # Check if it's a file or directory and remove accordingly
                  try:
                      if os.path.isfile(file_path) or os.path.islink(file_path):
+                         os.unlink(file_path)  # Remove the file or link
                      elif os.path.isdir(file_path):
+                         shutil.rmtree(file_path)  # Remove the directory and its contents
                  except Exception as e:
                      raise e
+
          self.app = fastapi.FastAPI()
          self._setup_routes()
+
      def _prepare(self):
          """Miners should modify this function to fit their fine-tuned models.
+
          This function will make any preparations necessary to initialize the
          speech enhancement model (i.e. downloading checkpoint files, etc.)
          """
+         # Initialize model
          self.model = ScoreModel.load_from_checkpoint(self.ckpt_path, self.device)
          self.model.t_eps = 0.03
          self.model.eval()
      def _enhance(self):
          """
          Miners should modify this function to fit their fine-tuned models.
+
          This function will:
          1. Open each noisy .wav file
+         2. Enhance the audio with the model
+         3. Save the enhanced audio in .wav format to ModelAPI.enhanced_audio_path
          """
+
+         # Check if the model is trained on 48 kHz data
+         if self.model.backbone == 'ncsnpp_48k':
              target_sr = 48000
              pad_mode = "reflection"
+         elif self.model.backbone == 'ncsnpp_v2':
              target_sr = 16000
              pad_mode = "reflection"
          else:
              target_sr = 16000
              pad_mode = "zero_pad"
+
+         # Define file paths for all noisy files to be enhanced
+         noisy_files = sorted(glob.glob(os.path.join(self.noisy_audio_path, '*.wav')))
+         for noisy_file in noisy_files:
+
              filename = noisy_file.replace(self.noisy_audio_path, "")
              filename = filename[1:] if filename.startswith("/") else filename

+             # Load wav
              y, sr = load(noisy_file)
+             # Resample if necessary
              if sr != target_sr:
                  y = torch.tensor(resample(y.numpy(), orig_sr=sr, target_sr=target_sr))

+             T_orig = y.size(1)

              # Normalize
              norm_factor = y.abs().max()
              y = y / norm_factor

              # Prepare DNN input
+             Y = torch.unsqueeze(self.model._forward_transform(self.model._stft(y.to(self.device))), 0)
              Y = pad_spec(Y, mode=pad_mode)
+
              # Reverse sampling
+             if self.model.sde.__class__.__name__ == 'OUVESDE':
+                 if self.model.sde.sampler_type == 'pc':
+                     sampler = self.model.get_pc_sampler('reverse_diffusion', self.corrector, Y.to(self.device), N=self.N,
+                                                         corrector_steps=self.corrector_steps, snr=self.snr)
+                 elif self.model.sde.sampler_type == 'ode':
                      sampler = self.model.get_ode_sampler(Y.to(self.device), N=self.N)
                  else:
+                     raise ValueError(f"Sampler type {self.model.sde.sampler_type} not supported")
+             elif self.model.sde.__class__.__name__ == 'SBVESDE':
+                 sampler_type = 'ode' if self.model.sde.sampler_type == 'pc' else self.model.sde.sampler_type
+                 sampler = self.model.get_sb_sampler(sde=self.model.sde, y=Y.cuda(), sampler_type=sampler_type)
              else:
+                 raise ValueError(f"SDE {self.model.sde.__class__.__name__} not supported")

              sample, _ = sampler()
+
+             # Backward transform in time domain
              x_hat = self.model.to_audio(sample.squeeze(), T_orig)
+
+             # Renormalize
              x_hat = x_hat * norm_factor
+
+             # Write enhanced wav file
+             os.makedirs(os.path.dirname(os.path.join(self.enhanced_audio_path, filename)), exist_ok=True)
+             write(os.path.join(self.enhanced_audio_path, filename), x_hat.cpu().numpy(), target_sr)
+
      def _setup_routes(self):
+         """
+         Setup API routes:
+
+         /status/ : Communicates API status
+         /upload-audio/ : Upload audio files, save to noisy audio directory
+         /enhance/ : Enhance audio files, save to enhanced audio directory
+         /download-enhanced/ : Download enhanced audio files
+         /reset/ : Reset noisy and enhanced file cache
+         """
          self.app.get("/status/")(self.get_status)
          self.app.post("/prepare/")(self.prepare)
          self.app.post("/upload-audio/")(self.upload_audio)
          self.app.post("/enhance/")(self.enhance_audio)
          self.app.get("/download-enhanced/")(self.download_enhanced)
          self.app.post("/reset/")(self.reset)
+
      def get_status(self):
          try:
              return {"container_running": True}
          except Exception as e:
              logging.error(f"Error getting status: {e}")
+             raise fastapi.HTTPException(status_code=500, detail="An error occurred while fetching API status.")
+
      def prepare(self):
          try:
              self._prepare()
+             return {'preparations': True}
          except Exception as e:
              logging.error(f"Error during preparations: {e}")
+             return fastapi.HTTPException(status_code=500, detail="An error occurred while fetching API status.")
+
      def upload_audio(self, files: List[fastapi.UploadFile] = fastapi.File(...)):
+
          uploaded_files = []
+
          for file in files:
+             try:
+                 # Define the path to save the file
                  file_path = os.path.join(self.noisy_audio_path, file.filename)
+
+                 # Save the uploaded file
                  with open(file_path, "wb") as f:
+                     while contents := file.file.read(1024*1024):
                          f.write(contents)
+
+                 # Append the file name to the list of uploaded files
+                 uploaded_files.append(file.filename)
+
              except Exception as e:
+                 logging.error(f"Error uploading files: {e}")
+                 raise fastapi.HTTPException(status_code=500, detail="An error occurred while uploading the noisy files.")
              finally:
                  file.file.close()
+
          print(f"uploaded files: {uploaded_files}")
+
          return {"uploaded_files": uploaded_files, "status": True}

      def enhance_audio(self):

              # Enhance audio
              self._enhance()
              # Obtain list of file paths for enhanced audio
+             wav_files = glob.glob(os.path.join(self.enhanced_audio_path, '*.wav'))
              # Extract just the file names
              enhanced_files = [os.path.basename(file) for file in wav_files]
              return {"status": True}
+
          except Exception as e:
              print(f"Exception occurred during enhancement: {e}")
+             raise fastapi.HTTPException(status_code=500, detail="An error occurred while enhancing the noisy files.")
+
      def download_enhanced(self):
          try:
+             # Create an in-memory zip file to hold all the enhanced audio files
              zip_buffer = io.BytesIO()

              with zipfile.ZipFile(zip_buffer, "w") as zip_file:
+                 # Add each .wav file in the enhanced_audio_path directory to the zip file
+                 for wav_file in glob.glob(os.path.join(self.enhanced_audio_path, '*.wav')):
                      zip_file.write(wav_file, arcname=os.path.basename(wav_file))
+
+             # Make sure to seek back to the start of the BytesIO object before sending it
              zip_buffer.seek(0)

+             # Send the zip file to the client as a downloadable file
              return fastapi.responses.StreamingResponse(
                  iter([zip_buffer.getvalue()]),  # Stream the in-memory content
                  media_type="application/zip",
+                 headers={"Content-Disposition": "attachment; filename=enhanced_audio_files.zip"}
              )

          except Exception as e:
              logging.error(f"Error during enhanced files download: {e}")
+             raise fastapi.HTTPException(status_code=500, detail=f"An error occurred while creating the download file: {str(e)}")
+
      def reset(self):
          """
          Removes all audio files in preparation for another batch of enhancement.

                  os.remove(filepath)
              except Exception as e:
                  print(f"Error removing {filepath}: {e}")
+                 return {"status": False, "noisy": os.listdir(self.noisy_audio_path), "enhanced": os.listdir(self.enhanced_audio_path)}
+         return {"status": True, "noisy": os.listdir(self.noisy_audio_path), "enhanced": os.listdir(self.enhanced_audio_path)}

      def run(self):
+
+         uvicorn.run(self.app, host=self.host, port=self.port)
app/miner_4.ckpt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:91992fe2205bee9a15c7c4302b7053e3ef8a9d15889c26bb19ec8529fc8a0903
- size 1312970157
  version https://git-lfs.github.com/spec/v1
+ oid sha256:b546fe7ee37fa22db34470deff369aba15f31406da4a43d04fa80bf485f316d5
+ size 1312981921
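The pointer file above is Git LFS metadata: the repo stores only a SHA-256 digest (`oid`) and a byte `size`, so this commit repoints `app/miner_4.ckpt` at a different ~1.3 GB checkpoint. A quick, illustrative way to verify a pulled checkpoint against the new pointer (the expected values are copied from the diff above; the script itself is not part of the repo):

```python
import hashlib
import os

EXPECTED_OID = "b546fe7ee37fa22db34470deff369aba15f31406da4a43d04fa80bf485f316d5"
EXPECTED_SIZE = 1312981921

def sha256_of(path, chunk=1024 * 1024):
    # Stream in 1 MiB chunks so the ~1.3 GB file never sits in memory at once
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        while block := f.read(chunk):
            digest.update(block)
    return digest.hexdigest()

path = "app/miner_4.ckpt"  # the real file, after `git lfs pull`
assert os.path.getsize(path) == EXPECTED_SIZE, "size mismatch"
assert sha256_of(path) == EXPECTED_OID, "checksum mismatch"
```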
app/run.py CHANGED
@@ -1,14 +1,8 @@
- import sys
- from pathlib import Path
-
- # Add parent directory to PYTHONPATH
- sys.path.append(str(Path(__file__).resolve().parent.parent))
-
- from app import ModelAPI
-
- api = ModelAPI(
-     host = "0.0.0.0",
-     port = 6500
- )
-
  api.run()
+ from app import ModelAPI
+
+ api = ModelAPI(
+     host = "0.0.0.0",
+     port = 6500
+ )
+
  api.run()
pyproject.toml CHANGED
@@ -1,58 +1,57 @@
- [build-system]
- requires = ["setuptools", "wheel"]
- build-backend = "setuptools.build_meta"
-
- [project]
- name = "modelapi"
- version = "1.0.0"
- description = "This project implements a container for a fine-tuned audio enhancement model."
- readme = { file = "README.md", content-type = "text/markdown" }
- license = { file = "LICENSE" }
- classifiers = [
-     "Development Status :: 3 - Beta",
-     "Intended Audience :: Developers",
-     "Topic :: Software Development :: Build Tools",
-     "License :: OSI Approved :: MIT License",
-     "Programming Language :: Python :: 3 :: Only",
-     "Programming Language :: Python :: 3.10",
-     "Topic :: Scientific/Engineering",
-     "Topic :: Scientific/Engineering :: Mathematics",
-     "Topic :: Scientific/Engineering :: Artificial Intelligence",
-     "Topic :: Software Development",
-     "Topic :: Software Development :: Libraries",
-     "Topic :: Software Development :: Libraries :: Python Modules"
- ]
- requires-python = ">=3.10,<3.12"
- dependencies = [
-     "fastapi==0.115.5",
-     "uvicorn==0.32.0",
-     "python-multipart==0.0.17",
-     "h5py==3.10.0",
-     "ipympl==0.9.3",
-     "librosa==0.10.1",
-     "ninja==1.11.1.1",
-     "numpy==1.24.4",
-     "pandas==2.0.3",
-     "pesq==0.0.4",
-     "pillow==10.2.0",
-     "protobuf==4.25.2",
-     "pyarrow==15.0.0",
-     "pyroomacoustics==0.7.3",
-     "pystoi==0.4.1",
-     "pytorch-lightning==2.1.4",
-     "scipy==1.10.1",
-     "setuptools==44.0.0",
-     "seaborn==0.13.2",
-     "torch==2.2.0",
-     "torch-ema==0.3",
-     "torchaudio==2.2.0",
-     "torchvision==0.17.0",
-     "torchinfo==1.8.0",
-     "torchsde==0.2.6",
-     "gdown==5.2.0",
-     "huggingface-hub==0.31.4",
-     "torch_pesq==0.1.2",
- ]
-
- [tool.setuptools.packages.find]
- include = ["app","model"]
+ [build-system]
+ requires = ["setuptools", "wheel"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "modelapi"
+ version = "1.0.0"
+ description = "This project implements a container for a fine-tuned audio enhancement model."
+ readme = { file = "README.md", content-type = "text/markdown" }
+ license = { file = "LICENSE" }
+ classifiers = [
+     "Development Status :: 3 - Beta",
+     "Intended Audience :: Developers",
+     "Topic :: Software Development :: Build Tools",
+     "License :: OSI Approved :: MIT License",
+     "Programming Language :: Python :: 3 :: Only",
+     "Programming Language :: Python :: 3.10",
+     "Topic :: Scientific/Engineering",
+     "Topic :: Scientific/Engineering :: Mathematics",
+     "Topic :: Scientific/Engineering :: Artificial Intelligence",
+     "Topic :: Software Development",
+     "Topic :: Software Development :: Libraries",
+     "Topic :: Software Development :: Libraries :: Python Modules"
+ ]
+ requires-python = ">=3.10,<3.12"
+ dependencies = [
+     "fastapi==0.115.5",
+     "uvicorn==0.32.0",
+     "python-multipart==0.0.17",
+     "h5py==3.10.0",
+     "ipympl==0.9.3",
+     "librosa==0.10.1",
+     "ninja==1.11.1.1",
+     "numpy==1.24.4",
+     "pandas==2.0.3",
+     "pesq==0.0.4",
+     "pillow==10.2.0",
+     "protobuf==4.25.2",
+     "pyarrow==15.0.0",
+     "pyroomacoustics==0.7.3",
+     "pystoi==0.4.1",
+     "pytorch-lightning==2.5.1",
+     "scipy==1.10.1",
+     "setuptools==44.0.0",
+     "seaborn==0.13.2",
+     "torch==2.2.0",
+     "torch-ema==0.3",
+     "torchaudio==2.2.0",
+     "torchvision==0.17.0",
+     "torchinfo==1.8.0",
+     "torchsde==0.2.6",
+     "gdown==5.2.0",
+     "torch_pesq==0.1.2"
+ ]
+
+ [tool.setuptools.packages.find]
+ include = ["app","model", "sgmse"]
sgmse/backbones/__init__.py CHANGED
@@ -1,7 +1,7 @@
  from .shared import BackboneRegistry
  from .ncsnpp import NCSNpp
  from .ncsnpp_v2 import NCSNpp_v2
  from .ncsnpp_48k import NCSNpp_48k
  from .dcunet import DCUNet

  __all__ = ['BackboneRegistry', 'NCSNpp', 'NCSNpp_v2', 'NCSNpp_48k', 'DCUNet']
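`BackboneRegistry` is re-exported here so that each backbone can register itself by name (see the `@BackboneRegistry.register("dcunet")` decorator in `dcunet.py` below). The actual implementation lives in `sgmse/backbones/shared.py`, which is not part of this diff; as a rough sketch of how a decorator-based registry of this kind typically works (illustrative only, not the sgmse code):

```python
class BackboneRegistry:
    """Maps a string name (e.g. "dcunet") to the backbone class that registered it."""
    _registry = {}

    @classmethod
    def register(cls, name):
        def decorator(backbone_cls):
            cls._registry[name] = backbone_cls
            return backbone_cls  # leave the decorated class itself untouched
        return decorator

    @classmethod
    def get_by_name(cls, name):
        return cls._registry[name]

# Usage mirrors the decorator seen in dcunet.py:
# @BackboneRegistry.register("dcunet")
# class DCUNet(nn.Module): ...
```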
sgmse/backbones/dcunet.py CHANGED
@@ -1,627 +1,627 @@
- from functools import partial
- import numpy as np
-
- import torch
- from torch import nn, Tensor
- from torch.nn.modules.batchnorm import _BatchNorm
-
- from .shared import BackboneRegistry, ComplexConv2d, ComplexConvTranspose2d, ComplexLinear, \
-     DiffusionStepEmbedding, GaussianFourierProjection, FeatureMapDense, torch_complex_from_reim
-
-
- def get_activation(name):
-     if name == "silu":
-         return nn.SiLU
-     elif name == "relu":
-         return nn.ReLU
-     elif name == "leaky_relu":
-         return nn.LeakyReLU
-     else:
-         raise NotImplementedError(f"Unknown activation: {name}")
-
-
- class BatchNorm(_BatchNorm):
-     def _check_input_dim(self, input):
-         if input.dim() < 2 or input.dim() > 4:
-             raise ValueError("expected 4D or 3D input (got {}D input)".format(input.dim()))
-
-
- class OnReIm(nn.Module):
-     def __init__(self, module_cls, *args, **kwargs):
-         super().__init__()
-         self.re_module = module_cls(*args, **kwargs)
-         self.im_module = module_cls(*args, **kwargs)
-
-     def forward(self, x):
-         return torch_complex_from_reim(self.re_module(x.real), self.im_module(x.imag))
-
-
- # Code for DCUNet largely copied from Danilo's `informedenh` repo, cheers!
-
- def unet_decoder_args(encoders, *, skip_connections):
-     """Get list of decoder arguments for upsampling (right) side of a symmetric u-net,
-     given the arguments used to construct the encoder.
-     Args:
-         encoders (tuple of length `N` of tuples of (in_chan, out_chan, kernel_size, stride, padding)):
-             List of arguments used to construct the encoders
-         skip_connections (bool): Whether to include skip connections in the
-             calculation of decoder input channels.
-     Return:
-         tuple of length `N` of tuples of (in_chan, out_chan, kernel_size, stride, padding):
-             Arguments to be used to construct decoders
-     """
-     decoder_args = []
-     for enc_in_chan, enc_out_chan, enc_kernel_size, enc_stride, enc_padding, enc_dilation in reversed(encoders):
-         if skip_connections and decoder_args:
-             skip_in_chan = enc_out_chan
-         else:
-             skip_in_chan = 0
-         decoder_args.append(
-             (enc_out_chan + skip_in_chan, enc_in_chan, enc_kernel_size, enc_stride, enc_padding, enc_dilation)
-         )
-     return tuple(decoder_args)
-
-
- def make_unet_encoder_decoder_args(encoder_args, decoder_args):
-     encoder_args = tuple(
-         (
-             in_chan,
-             out_chan,
-             tuple(kernel_size),
-             tuple(stride),
-             tuple([n // 2 for n in kernel_size]) if padding == "auto" else tuple(padding),
-             tuple(dilation)
-         )
-         for in_chan, out_chan, kernel_size, stride, padding, dilation in encoder_args
-     )
-
-     if decoder_args == "auto":
-         decoder_args = unet_decoder_args(
-             encoder_args,
-             skip_connections=True,
-         )
-     else:
-         decoder_args = tuple(
-             (
-                 in_chan,
-                 out_chan,
-                 tuple(kernel_size),
-                 tuple(stride),
-                 tuple([n // 2 for n in kernel_size]) if padding == "auto" else padding,
-                 tuple(dilation),
-                 output_padding,
-             )
-             for in_chan, out_chan, kernel_size, stride, padding, dilation, output_padding in decoder_args
-         )
-
-     return encoder_args, decoder_args
-
-
- DCUNET_ARCHITECTURES = {
-     "DCUNet-10": make_unet_encoder_decoder_args(
-         # Encoders:
-         # (in_chan, out_chan, kernel_size, stride, padding, dilation)
-         (
-             (1, 32, (7, 5), (2, 2), "auto", (1,1)),
-             (32, 64, (7, 5), (2, 2), "auto", (1,1)),
-             (64, 64, (5, 3), (2, 2), "auto", (1,1)),
-             (64, 64, (5, 3), (2, 2), "auto", (1,1)),
-             (64, 64, (5, 3), (2, 1), "auto", (1,1)),
-         ),
-         # Decoders: automatic inverse
-         "auto",
-     ),
-     "DCUNet-16": make_unet_encoder_decoder_args(
-         # Encoders:
-         # (in_chan, out_chan, kernel_size, stride, padding, dilation)
-         (
-             (1, 32, (7, 5), (2, 2), "auto", (1,1)),
-             (32, 32, (7, 5), (2, 1), "auto", (1,1)),
-             (32, 64, (7, 5), (2, 2), "auto", (1,1)),
-             (64, 64, (5, 3), (2, 1), "auto", (1,1)),
-             (64, 64, (5, 3), (2, 2), "auto", (1,1)),
-             (64, 64, (5, 3), (2, 1), "auto", (1,1)),
-             (64, 64, (5, 3), (2, 2), "auto", (1,1)),
-             (64, 64, (5, 3), (2, 1), "auto", (1,1)),
-         ),
-         # Decoders: automatic inverse
-         "auto",
-     ),
-     "DCUNet-20": make_unet_encoder_decoder_args(
-         # Encoders:
-         # (in_chan, out_chan, kernel_size, stride, padding, dilation)
-         (
-             (1, 32, (7, 1), (1, 1), "auto", (1,1)),
-             (32, 32, (1, 7), (1, 1), "auto", (1,1)),
-             (32, 64, (7, 5), (2, 2), "auto", (1,1)),
-             (64, 64, (7, 5), (2, 1), "auto", (1,1)),
-             (64, 64, (5, 3), (2, 2), "auto", (1,1)),
-             (64, 64, (5, 3), (2, 1), "auto", (1,1)),
-             (64, 64, (5, 3), (2, 2), "auto", (1,1)),
-             (64, 64, (5, 3), (2, 1), "auto", (1,1)),
-             (64, 64, (5, 3), (2, 2), "auto", (1,1)),
-             (64, 90, (5, 3), (2, 1), "auto", (1,1)),
-         ),
-         # Decoders: automatic inverse
-         "auto",
-     ),
-     "DilDCUNet-v2": make_unet_encoder_decoder_args(  # architecture used in SGMSE / Interspeech paper
-         # Encoders:
-         # (in_chan, out_chan, kernel_size, stride, padding, dilation)
-         (
-             (1, 32, (4, 4), (1, 1), "auto", (1, 1)),
-             (32, 32, (4, 4), (1, 1), "auto", (1, 1)),
-             (32, 32, (4, 4), (1, 1), "auto", (1, 1)),
-             (32, 64, (4, 4), (2, 1), "auto", (2, 1)),
-             (64, 128, (4, 4), (2, 2), "auto", (4, 1)),
-             (128, 256, (4, 4), (2, 2), "auto", (8, 1)),
-         ),
-         # Decoders: automatic inverse
-         "auto",
-     ),
- }
-
-
- @BackboneRegistry.register("dcunet")
- class DCUNet(nn.Module):
-     @staticmethod
-     def add_argparse_args(parser):
-         parser.add_argument("--dcunet-architecture", type=str, default="DilDCUNet-v2", choices=DCUNET_ARCHITECTURES.keys(), help="The concrete DCUNet architecture. 'DilDCUNet-v2' by default.")
-         parser.add_argument("--dcunet-time-embedding", type=str, choices=("gfp", "ds", "none"), default="gfp", help="Timestep embedding style. 'gfp' (Gaussian Fourier Projections) by default.")
-         parser.add_argument("--dcunet-temb-layers-global", type=int, default=1, help="Number of global linear+activation layers for the time embedding. 1 by default.")
-         parser.add_argument("--dcunet-temb-layers-local", type=int, default=1, help="Number of local (per-encoder/per-decoder) linear+activation layers for the time embedding. 1 by default.")
-         parser.add_argument("--dcunet-temb-activation", type=str, default="silu", help="The (complex) activation to use between all (global&local) time embedding layers.")
-         parser.add_argument("--dcunet-time-embedding-complex", action="store_true", help="Use complex-valued timestep embedding. Compatible with 'gfp' and 'ds' embeddings.")
-         parser.add_argument("--dcunet-fix-length", type=str, default="pad", choices=("pad", "trim", "none"), help="DCUNet strategy to 'fix' mismatched input timespan. 'pad' by default.")
-         parser.add_argument("--dcunet-mask-bound", type=str, choices=("tanh", "sigmoid", "none"), default="none", help="DCUNet output bounding strategy. 'none' by default.")
-         parser.add_argument("--dcunet-norm-type", type=str, choices=("bN", "CbN"), default="bN", help="The type of norm to use within each encoder and decoder layer. 'bN' (real/imaginary separate batch norm) by default.")
-         parser.add_argument("--dcunet-activation", type=str, choices=("leaky_relu", "relu", "silu"), default="leaky_relu", help="The activation to use within each encoder and decoder layer. 'leaky_relu' by default.")
-         return parser
-
-     def __init__(
-         self,
-         dcunet_architecture: str = "DilDCUNet-v2",
-         dcunet_time_embedding: str = "gfp",
-         dcunet_temb_layers_global: int = 2,
-         dcunet_temb_layers_local: int = 1,
-         dcunet_temb_activation: str = "silu",
-         dcunet_time_embedding_complex: bool = False,
-         dcunet_fix_length: str = "pad",
-         dcunet_mask_bound: str = "none",
-         dcunet_norm_type: str = "bN",
-         dcunet_activation: str = "relu",
-         embed_dim: int = 128,
-         **kwargs
-     ):
-         super().__init__()
-
-         self.architecture = dcunet_architecture
-         self.fix_length_mode = (dcunet_fix_length if dcunet_fix_length != "none" else None)
-         self.norm_type = dcunet_norm_type
-         self.activation = dcunet_activation
-         self.input_channels = 2  # for x_t and y -- note that this is 2 rather than 4, because we directly treat complex channels in this DNN
-         self.time_embedding = (dcunet_time_embedding if dcunet_time_embedding != "none" else None)
-         self.time_embedding_complex = dcunet_time_embedding_complex
-         self.temb_layers_global = dcunet_temb_layers_global
-         self.temb_layers_local = dcunet_temb_layers_local
-         self.temb_activation = dcunet_temb_activation
-         conf_encoders, conf_decoders = DCUNET_ARCHITECTURES[dcunet_architecture]
-
-         # Replace `input_channels` in encoders config
-         _replaced_input_channels, *rest = conf_encoders[0]
-         encoders = ((self.input_channels, *rest), *conf_encoders[1:])
-         decoders = conf_decoders
-         self.encoders_stride_product = np.prod(
-             [enc_stride for _, _, _, enc_stride, _, _ in encoders], axis=0
-         )
-
-         # Prepare kwargs for encoder and decoder (to potentially be modified before layer instantiation)
-         encoder_decoder_kwargs = dict(
-             norm_type=self.norm_type, activation=self.activation,
-             temb_layers=self.temb_layers_local, temb_activation=self.temb_activation)
-
-         # Instantiate (global) time embedding layer
-         embed_ops = []
-         if self.time_embedding is not None:
-             complex_valued = self.time_embedding_complex
-             if self.time_embedding == "gfp":
-                 embed_ops += [GaussianFourierProjection(embed_dim=embed_dim, complex_valued=complex_valued)]
-                 encoder_decoder_kwargs["embed_dim"] = embed_dim
-             elif self.time_embedding == "ds":
-                 embed_ops += [DiffusionStepEmbedding(embed_dim=embed_dim, complex_valued=complex_valued)]
-                 encoder_decoder_kwargs["embed_dim"] = embed_dim
-
-             if self.time_embedding_complex:
-                 assert self.time_embedding in ("gfp", "ds"), "Complex timestep embedding only available for gfp and ds"
-                 encoder_decoder_kwargs["complex_time_embedding"] = True
-             for _ in range(self.temb_layers_global):
-                 embed_ops += [
-                     ComplexLinear(embed_dim, embed_dim, complex_valued=True),
-                     OnReIm(get_activation(dcunet_temb_activation))
-                 ]
-         self.embed = nn.Sequential(*embed_ops)
-
-         ### Instantiate DCUNet layers ###
-         output_layer = ComplexConvTranspose2d(*decoders[-1])
-         encoders = [DCUNetComplexEncoderBlock(*args, **encoder_decoder_kwargs) for args in encoders]
-         decoders = [DCUNetComplexDecoderBlock(*args, **encoder_decoder_kwargs) for args in decoders[:-1]]
-
-         self.mask_bound = (dcunet_mask_bound if dcunet_mask_bound != "none" else None)
-         if self.mask_bound is not None:
-             raise NotImplementedError("sorry, mask bounding not implemented at the moment")
-             # TODO we can't use nn.Sequential since the ComplexConvTranspose2d needs a second `output_size` argument
-             #operations = (output_layer, complex_nn.BoundComplexMask(self.mask_bound))
-             #output_layer = nn.Sequential(*[x for x in operations if x is not None])
-
-         assert len(encoders) == len(decoders) + 1
-         self.encoders = nn.ModuleList(encoders)
-         self.decoders = nn.ModuleList(decoders)
-         self.output_layer = output_layer or nn.Identity()
-
-     def forward(self, spec, t) -> Tensor:
-         """
-         Input shape is expected to be $(batch, nfreqs, time)$, with $nfreqs - 1$ divisible
-         by $f_0 * f_1 * ... * f_N$ where $f_k$ are the frequency strides of the encoders,
-         and $time - 1$ is divisible by $t_0 * t_1 * ... * t_N$ where $t_N$ are the time
-         strides of the encoders.
-         Args:
-             spec (Tensor): complex spectrogram tensor. 1D, 2D or 3D tensor, time last.
-         Returns:
-             Tensor, of shape (batch, time) or (time).
-         """
-         # TF-rep shape: (batch, self.input_channels, n_fft, frames)
-         # Estimate mask from time-frequency representation.
-         x_in = self.fix_input_dims(spec)
-         x = x_in
-         t_embed = self.embed(t+0j) if self.time_embedding is not None else None
-
-         enc_outs = []
-         for idx, enc in enumerate(self.encoders):
-             x = enc(x, t_embed)
-             # UNet skip connection
-             enc_outs.append(x)
-         for (enc_out, dec) in zip(reversed(enc_outs[:-1]), self.decoders):
-             x = dec(x, t_embed, output_size=enc_out.shape)
-             x = torch.cat([x, enc_out], dim=1)
-
-         output = self.output_layer(x, output_size=x_in.shape)
-         # output shape: (batch, 1, n_fft, frames)
-         output = self.fix_output_dims(output, spec)
-         return output
-
-     def fix_input_dims(self, x):
-         return _fix_dcu_input_dims(
-             self.fix_length_mode, x, torch.from_numpy(self.encoders_stride_product)
-         )
-
-     def fix_output_dims(self, out, x):
-         return _fix_dcu_output_dims(self.fix_length_mode, out, x)
-
-
- def _fix_dcu_input_dims(fix_length_mode, x, encoders_stride_product):
-     """Pad or trim `x` to a length compatible with DCUNet."""
-     freq_prod = int(encoders_stride_product[0])
-     time_prod = int(encoders_stride_product[1])
-     if (x.shape[2] - 1) % freq_prod:
-         raise TypeError(
-             f"Input shape must be [batch, ch, freq + 1, time + 1] with freq divisible by "
-             f"{freq_prod}, got {x.shape} instead"
-         )
-     time_remainder = (x.shape[3] - 1) % time_prod
-     if time_remainder:
-         if fix_length_mode is None:
-             raise TypeError(
-                 f"Input shape must be [batch, ch, freq + 1, time + 1] with time divisible by "
-                 f"{time_prod}, got {x.shape} instead. Set the 'fix_length_mode' argument "
-                 f"in 'DCUNet' to 'pad' or 'trim' to fix shapes automatically."
-             )
-         elif fix_length_mode == "pad":
-             pad_shape = [0, time_prod - time_remainder]
-             x = nn.functional.pad(x, pad_shape, mode="constant")
-         elif fix_length_mode == "trim":
-             pad_shape = [0, -time_remainder]
-             x = nn.functional.pad(x, pad_shape, mode="constant")
-         else:
-             raise ValueError(f"Unknown fix_length mode '{fix_length_mode}'")
-     return x
-
-
- def _fix_dcu_output_dims(fix_length_mode, out, x):
-     """Fix shape of `out` to the original shape of `x` by padding/cropping."""
-     inp_len = x.shape[-1]
-     output_len = out.shape[-1]
-     return nn.functional.pad(out, [0, inp_len - output_len])
-
-
- def _get_norm(norm_type):
-     if norm_type == "CbN":
-         return ComplexBatchNorm
-     elif norm_type == "bN":
-         return partial(OnReIm, BatchNorm)
-     else:
-         raise NotImplementedError(f"Unknown norm type: {norm_type}")
-
-
- class DCUNetComplexEncoderBlock(nn.Module):
-     def __init__(
-         self,
-         in_chan,
-         out_chan,
-         kernel_size,
-         stride,
-         padding,
-         dilation,
-         norm_type="bN",
-         activation="leaky_relu",
-         embed_dim=None,
-         complex_time_embedding=False,
-         temb_layers=1,
-         temb_activation="silu"
-     ):
-         super().__init__()
-
-         self.in_chan = in_chan
-         self.out_chan = out_chan
-         self.kernel_size = kernel_size
-         self.stride = stride
-         self.padding = padding
-         self.dilation = dilation
-         self.temb_layers = temb_layers
-         self.temb_activation = temb_activation
-         self.complex_time_embedding = complex_time_embedding
-
-         self.conv = ComplexConv2d(
-             in_chan, out_chan, kernel_size, stride, padding, bias=norm_type is None, dilation=dilation
-         )
-         self.norm = _get_norm(norm_type)(out_chan)
-         self.activation = OnReIm(get_activation(activation))
-         self.embed_dim = embed_dim
-         if self.embed_dim is not None:
-             ops = []
-             for _ in range(max(0, self.temb_layers - 1)):
-                 ops += [
-                     ComplexLinear(self.embed_dim, self.embed_dim, complex_valued=True),
-                     OnReIm(get_activation(self.temb_activation))
-                 ]
-             ops += [
-                 FeatureMapDense(self.embed_dim, self.out_chan, complex_valued=True),
-                 OnReIm(get_activation(self.temb_activation))
-             ]
-             self.embed_layer = nn.Sequential(*ops)
-
-     def forward(self, x, t_embed):
-         y = self.conv(x)
-         if self.embed_dim is not None:
-             y = y + self.embed_layer(t_embed)
-         return self.activation(self.norm(y))
-
-
- class DCUNetComplexDecoderBlock(nn.Module):
-     def __init__(
-         self,
-         in_chan,
-         out_chan,
-         kernel_size,
-         stride,
-         padding,
-         dilation,
-         output_padding=(0, 0),
-         norm_type="bN",
-         activation="leaky_relu",
-         embed_dim=None,
-         temb_layers=1,
-         temb_activation='swish',
-         complex_time_embedding=False,
-     ):
-         super().__init__()
-
-         self.in_chan = in_chan
-         self.out_chan = out_chan
-         self.kernel_size = kernel_size
-         self.stride = stride
-         self.padding = padding
-         self.dilation = dilation
-         self.output_padding = output_padding
-         self.complex_time_embedding = complex_time_embedding
-         self.temb_layers = temb_layers
-         self.temb_activation = temb_activation
-
-         self.deconv = ComplexConvTranspose2d(
-             in_chan, out_chan, kernel_size, stride, padding, output_padding, dilation=dilation, bias=norm_type is None
-         )
-         self.norm = _get_norm(norm_type)(out_chan)
-         self.activation = OnReIm(get_activation(activation))
-         self.embed_dim = embed_dim
-         if self.embed_dim is not None:
-             ops = []
-             for _ in range(max(0, self.temb_layers - 1)):
-                 ops += [
-                     ComplexLinear(self.embed_dim, self.embed_dim, complex_valued=True),
-                     OnReIm(get_activation(self.temb_activation))
-                 ]
-             ops += [
-                 FeatureMapDense(self.embed_dim, self.out_chan, complex_valued=True),
-                 OnReIm(get_activation(self.temb_activation))
-             ]
-             self.embed_layer = nn.Sequential(*ops)
-
-     def forward(self, x, t_embed, output_size=None):
-         y = self.deconv(x, output_size=output_size)
-         if self.embed_dim is not None:
-             y = y + self.embed_layer(t_embed)
-         return self.activation(self.norm(y))
-
-
- # From https://github.com/chanil1218/DCUnet.pytorch/blob/2dcdd30804be47a866fde6435cbb7e2f81585213/models/layers/complexnn.py
- class ComplexBatchNorm(torch.nn.Module):
-     def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, track_running_stats=False):
-         super(ComplexBatchNorm, self).__init__()
-         self.num_features = num_features
-         self.eps = eps
-         self.momentum = momentum
-         self.affine = affine
-         self.track_running_stats = track_running_stats
-         if self.affine:
-             self.Wrr = torch.nn.Parameter(torch.Tensor(num_features))
-             self.Wri = torch.nn.Parameter(torch.Tensor(num_features))
-             self.Wii = torch.nn.Parameter(torch.Tensor(num_features))
-             self.Br = torch.nn.Parameter(torch.Tensor(num_features))
-             self.Bi = torch.nn.Parameter(torch.Tensor(num_features))
-         else:
-             self.register_parameter('Wrr', None)
-             self.register_parameter('Wri', None)
-             self.register_parameter('Wii', None)
-             self.register_parameter('Br', None)
-             self.register_parameter('Bi', None)
-         if self.track_running_stats:
-             self.register_buffer('RMr', torch.zeros(num_features))
-             self.register_buffer('RMi', torch.zeros(num_features))
479
- self.register_buffer('RVrr', torch.ones (num_features))
480
- self.register_buffer('RVri', torch.zeros(num_features))
481
- self.register_buffer('RVii', torch.ones (num_features))
482
- self.register_buffer('num_batches_tracked', torch.tensor(0, dtype=torch.long))
483
- else:
484
- self.register_parameter('RMr', None)
485
- self.register_parameter('RMi', None)
486
- self.register_parameter('RVrr', None)
487
- self.register_parameter('RVri', None)
488
- self.register_parameter('RVii', None)
489
- self.register_parameter('num_batches_tracked', None)
490
- self.reset_parameters()
491
-
492
- def reset_running_stats(self):
493
- if self.track_running_stats:
494
- self.RMr.zero_()
495
- self.RMi.zero_()
496
- self.RVrr.fill_(1)
497
- self.RVri.zero_()
498
- self.RVii.fill_(1)
499
- self.num_batches_tracked.zero_()
500
-
501
- def reset_parameters(self):
502
- self.reset_running_stats()
503
- if self.affine:
504
- self.Br.data.zero_()
505
- self.Bi.data.zero_()
506
- self.Wrr.data.fill_(1)
507
- self.Wri.data.uniform_(-.9, +.9) # W will be positive-definite
508
- self.Wii.data.fill_(1)
509
-
510
- def _check_input_dim(self, xr, xi):
511
- assert(xr.shape == xi.shape)
512
- assert(xr.size(1) == self.num_features)
513
-
514
- def forward(self, x):
515
- xr, xi = x.real, x.imag
516
- self._check_input_dim(xr, xi)
517
-
518
- exponential_average_factor = 0.0
519
-
520
- if self.training and self.track_running_stats:
521
- self.num_batches_tracked += 1
522
- if self.momentum is None: # use cumulative moving average
523
- exponential_average_factor = 1.0 / self.num_batches_tracked.item()
524
- else: # use exponential moving average
525
- exponential_average_factor = self.momentum
526
-
527
- #
528
- # NOTE: The precise meaning of the "training flag" is:
529
- # True: Normalize using batch statistics, update running statistics
530
- # if they are being collected.
531
- # False: Normalize using running statistics, ignore batch statistics.
532
- #
533
- training = self.training or not self.track_running_stats
534
- redux = [i for i in reversed(range(xr.dim())) if i!=1]
535
- vdim = [1] * xr.dim()
536
- vdim[1] = xr.size(1)
537
-
538
- #
539
- # Mean M Computation and Centering
540
- #
541
- # Includes running mean update if training and running.
542
- #
543
- if training:
544
- Mr, Mi = xr, xi
545
- for d in redux:
546
- Mr = Mr.mean(d, keepdim=True)
547
- Mi = Mi.mean(d, keepdim=True)
548
- if self.track_running_stats:
549
- self.RMr.lerp_(Mr.squeeze(), exponential_average_factor)
550
- self.RMi.lerp_(Mi.squeeze(), exponential_average_factor)
551
- else:
552
- Mr = self.RMr.view(vdim)
553
- Mi = self.RMi.view(vdim)
554
- xr, xi = xr-Mr, xi-Mi
555
-
556
- #
557
- # Variance Matrix V Computation
558
- #
559
- # Includes epsilon numerical stabilizer/Tikhonov regularizer.
560
- # Includes running variance update if training and running.
561
- #
562
- if training:
563
- Vrr = xr * xr
564
- Vri = xr * xi
565
- Vii = xi * xi
566
- for d in redux:
567
- Vrr = Vrr.mean(d, keepdim=True)
568
- Vri = Vri.mean(d, keepdim=True)
569
- Vii = Vii.mean(d, keepdim=True)
570
- if self.track_running_stats:
571
- self.RVrr.lerp_(Vrr.squeeze(), exponential_average_factor)
572
- self.RVri.lerp_(Vri.squeeze(), exponential_average_factor)
573
- self.RVii.lerp_(Vii.squeeze(), exponential_average_factor)
574
- else:
575
- Vrr = self.RVrr.view(vdim)
576
- Vri = self.RVri.view(vdim)
577
- Vii = self.RVii.view(vdim)
578
- Vrr = Vrr + self.eps
579
- Vri = Vri
580
- Vii = Vii + self.eps
581
-
582
- #
583
- # Matrix Inverse Square Root U = V^-0.5
584
- #
585
- # sqrt of a 2x2 matrix,
586
- # - https://en.wikipedia.org/wiki/Square_root_of_a_2_by_2_matrix
587
- tau = Vrr + Vii
588
- delta = torch.addcmul(Vrr * Vii, Vri, Vri, value=-1)
589
- s = delta.sqrt()
590
- t = (tau + 2*s).sqrt()
591
-
592
- # matrix inverse, http://mathworld.wolfram.com/MatrixInverse.html
593
- rst = (s * t).reciprocal()
594
- Urr = (s + Vii) * rst
595
- Uii = (s + Vrr) * rst
596
- Uri = ( - Vri) * rst
597
-
598
- #
599
- # Optionally left-multiply U by affine weights W to produce combined
600
- # weights Z, left-multiply the inputs by Z, then optionally bias them.
601
- #
602
- # y = Zx + B
603
- # y = WUx + B
604
- # y = [Wrr Wri][Urr Uri] [xr] + [Br]
605
- # [Wir Wii][Uir Uii] [xi] [Bi]
606
- #
607
- if self.affine:
608
- Wrr, Wri, Wii = self.Wrr.view(vdim), self.Wri.view(vdim), self.Wii.view(vdim)
609
- Zrr = (Wrr * Urr) + (Wri * Uri)
610
- Zri = (Wrr * Uri) + (Wri * Uii)
611
- Zir = (Wri * Urr) + (Wii * Uri)
612
- Zii = (Wri * Uri) + (Wii * Uii)
613
- else:
614
- Zrr, Zri, Zir, Zii = Urr, Uri, Uri, Uii
615
-
616
- yr = (Zrr * xr) + (Zri * xi)
617
- yi = (Zir * xr) + (Zii * xi)
618
-
619
- if self.affine:
620
- yr = yr + self.Br.view(vdim)
621
- yi = yi + self.Bi.view(vdim)
622
-
623
- return torch.view_as_complex(torch.stack([yr, yi], dim=-1))
624
-
625
- def extra_repr(self):
626
- return '{num_features}, eps={eps}, momentum={momentum}, affine={affine}, ' \
627
- 'track_running_stats={track_running_stats}'.format(**self.__dict__)
 
1
+ from functools import partial
2
+ import numpy as np
3
+
4
+ import torch
5
+ from torch import nn, Tensor
6
+ from torch.nn.modules.batchnorm import _BatchNorm
7
+
8
+ from .shared import BackboneRegistry, ComplexConv2d, ComplexConvTranspose2d, ComplexLinear, \
9
+ DiffusionStepEmbedding, GaussianFourierProjection, FeatureMapDense, torch_complex_from_reim
10
+
11
+
12
+ def get_activation(name):
13
+ if name == "silu":
14
+ return nn.SiLU
15
+ elif name == "relu":
16
+ return nn.ReLU
17
+ elif name == "leaky_relu":
18
+ return nn.LeakyReLU
19
+ else:
20
+ raise NotImplementedError(f"Unknown activation: {name}")
21
+
22
+
23
+ class BatchNorm(_BatchNorm):
24
+ def _check_input_dim(self, input):
25
+ if input.dim() < 2 or input.dim() > 4:
26
+ raise ValueError("expected 4D or 3D input (got {}D input)".format(input.dim()))
27
+
28
+
29
+ class OnReIm(nn.Module):
30
+ def __init__(self, module_cls, *args, **kwargs):
31
+ super().__init__()
32
+ self.re_module = module_cls(*args, **kwargs)
33
+ self.im_module = module_cls(*args, **kwargs)
34
+
35
+ def forward(self, x):
36
+ return torch_complex_from_reim(self.re_module(x.real), self.im_module(x.imag))
37
+
38
+
39
+ # Code for DCUNet largely copied from Danilo's `informedenh` repo, cheers!
40
+
41
+ def unet_decoder_args(encoders, *, skip_connections):
42
+ """Get list of decoder arguments for upsampling (right) side of a symmetric u-net,
43
+ given the arguments used to construct the encoder.
44
+ Args:
45
+ encoders (tuple of length `N` of tuples of (in_chan, out_chan, kernel_size, stride, padding, dilation)):
46
+ List of arguments used to construct the encoders
47
+ skip_connections (bool): Whether to include skip connections in the
48
+ calculation of decoder input channels.
49
+ Returns:
+ tuple of length `N` of tuples of (in_chan, out_chan, kernel_size, stride, padding, dilation):
51
+ Arguments to be used to construct decoders
52
+ """
53
+ decoder_args = []
54
+ for enc_in_chan, enc_out_chan, enc_kernel_size, enc_stride, enc_padding, enc_dilation in reversed(encoders):
55
+ if skip_connections and decoder_args:
56
+ skip_in_chan = enc_out_chan
57
+ else:
58
+ skip_in_chan = 0
59
+ decoder_args.append(
60
+ (enc_out_chan + skip_in_chan, enc_in_chan, enc_kernel_size, enc_stride, enc_padding, enc_dilation)
61
+ )
62
+ return tuple(decoder_args)
63
+
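+ # Illustrative example (added, not part of the original source): for a single
+ # encoder spec (1, 32, (7, 5), (2, 2), (3, 2), (1, 1)) with skip_connections=True,
+ # this returns the mirrored decoder spec ((32, 1, (7, 5), (2, 2), (3, 2), (1, 1)),).
+ # With more encoders, every decoder except the deepest one gets
+ # in_chan = 2 * enc_out_chan, since the forward pass concatenates the matching
+ # encoder output (skip connection) onto the previous decoder output.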
64
+
65
+ def make_unet_encoder_decoder_args(encoder_args, decoder_args):
66
+ encoder_args = tuple(
67
+ (
68
+ in_chan,
69
+ out_chan,
70
+ tuple(kernel_size),
71
+ tuple(stride),
72
+ tuple([n // 2 for n in kernel_size]) if padding == "auto" else tuple(padding),
73
+ tuple(dilation)
74
+ )
75
+ for in_chan, out_chan, kernel_size, stride, padding, dilation in encoder_args
76
+ )
77
+
78
+ if decoder_args == "auto":
79
+ decoder_args = unet_decoder_args(
80
+ encoder_args,
81
+ skip_connections=True,
82
+ )
83
+ else:
84
+ decoder_args = tuple(
85
+ (
86
+ in_chan,
87
+ out_chan,
88
+ tuple(kernel_size),
89
+ tuple(stride),
90
+ tuple([n // 2 for n in kernel_size]) if padding == "auto" else padding,
91
+ tuple(dilation),
92
+ output_padding,
93
+ )
94
+ for in_chan, out_chan, kernel_size, stride, padding, dilation, output_padding in decoder_args
95
+ )
96
+
97
+ return encoder_args, decoder_args
98
+
99
+
100
+ DCUNET_ARCHITECTURES = {
101
+ "DCUNet-10": make_unet_encoder_decoder_args(
102
+ # Encoders:
103
+ # (in_chan, out_chan, kernel_size, stride, padding, dilation)
104
+ (
105
+ (1, 32, (7, 5), (2, 2), "auto", (1,1)),
106
+ (32, 64, (7, 5), (2, 2), "auto", (1,1)),
107
+ (64, 64, (5, 3), (2, 2), "auto", (1,1)),
108
+ (64, 64, (5, 3), (2, 2), "auto", (1,1)),
109
+ (64, 64, (5, 3), (2, 1), "auto", (1,1)),
110
+ ),
111
+ # Decoders: automatic inverse
112
+ "auto",
113
+ ),
114
+ "DCUNet-16": make_unet_encoder_decoder_args(
115
+ # Encoders:
116
+ # (in_chan, out_chan, kernel_size, stride, padding, dilation)
117
+ (
118
+ (1, 32, (7, 5), (2, 2), "auto", (1,1)),
119
+ (32, 32, (7, 5), (2, 1), "auto", (1,1)),
120
+ (32, 64, (7, 5), (2, 2), "auto", (1,1)),
121
+ (64, 64, (5, 3), (2, 1), "auto", (1,1)),
122
+ (64, 64, (5, 3), (2, 2), "auto", (1,1)),
123
+ (64, 64, (5, 3), (2, 1), "auto", (1,1)),
124
+ (64, 64, (5, 3), (2, 2), "auto", (1,1)),
125
+ (64, 64, (5, 3), (2, 1), "auto", (1,1)),
126
+ ),
127
+ # Decoders: automatic inverse
128
+ "auto",
129
+ ),
130
+ "DCUNet-20": make_unet_encoder_decoder_args(
131
+ # Encoders:
132
+ # (in_chan, out_chan, kernel_size, stride, padding, dilation)
133
+ (
134
+ (1, 32, (7, 1), (1, 1), "auto", (1,1)),
135
+ (32, 32, (1, 7), (1, 1), "auto", (1,1)),
136
+ (32, 64, (7, 5), (2, 2), "auto", (1,1)),
137
+ (64, 64, (7, 5), (2, 1), "auto", (1,1)),
138
+ (64, 64, (5, 3), (2, 2), "auto", (1,1)),
139
+ (64, 64, (5, 3), (2, 1), "auto", (1,1)),
140
+ (64, 64, (5, 3), (2, 2), "auto", (1,1)),
141
+ (64, 64, (5, 3), (2, 1), "auto", (1,1)),
142
+ (64, 64, (5, 3), (2, 2), "auto", (1,1)),
143
+ (64, 90, (5, 3), (2, 1), "auto", (1,1)),
144
+ ),
145
+ # Decoders: automatic inverse
146
+ "auto",
147
+ ),
148
+ "DilDCUNet-v2": make_unet_encoder_decoder_args( # architecture used in SGMSE / Interspeech paper
149
+ # Encoders:
150
+ # (in_chan, out_chan, kernel_size, stride, padding, dilation)
151
+ (
152
+ (1, 32, (4, 4), (1, 1), "auto", (1, 1)),
153
+ (32, 32, (4, 4), (1, 1), "auto", (1, 1)),
154
+ (32, 32, (4, 4), (1, 1), "auto", (1, 1)),
155
+ (32, 64, (4, 4), (2, 1), "auto", (2, 1)),
156
+ (64, 128, (4, 4), (2, 2), "auto", (4, 1)),
157
+ (128, 256, (4, 4), (2, 2), "auto", (8, 1)),
158
+ ),
159
+ # Decoders: automatic inverse
160
+ "auto",
161
+ ),
162
+ }
163
+
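+ # Added note (not in the original source): for "DilDCUNet-v2" the encoder stride
+ # products are 8 (frequency) and 4 (time), so inputs must satisfy
+ # (nfreqs - 1) % 8 == 0; the time axis is padded/trimmed automatically when
+ # fix_length_mode is set (default "pad") -- e.g. (batch, 2, 257, 257) works.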
164
+
165
+ @BackboneRegistry.register("dcunet")
166
+ class DCUNet(nn.Module):
167
+ @staticmethod
168
+ def add_argparse_args(parser):
169
+ parser.add_argument("--dcunet-architecture", type=str, default="DilDCUNet-v2", choices=DCUNET_ARCHITECTURES.keys(), help="The concrete DCUNet architecture. 'DilDCUNet-v2' by default.")
170
+ parser.add_argument("--dcunet-time-embedding", type=str, choices=("gfp", "ds", "none"), default="gfp", help="Timestep embedding style. 'gfp' (Gaussian Fourier Projections) by default.")
171
+ parser.add_argument("--dcunet-temb-layers-global", type=int, default=1, help="Number of global linear+activation layers for the time embedding. 1 by default.")
172
+ parser.add_argument("--dcunet-temb-layers-local", type=int, default=1, help="Number of local (per-encoder/per-decoder) linear+activation layers for the time embedding. 1 by default.")
173
+ parser.add_argument("--dcunet-temb-activation", type=str, default="silu", help="The (complex) activation to use between all (global&local) time embedding layers.")
174
+ parser.add_argument("--dcunet-time-embedding-complex", action="store_true", help="Use complex-valued timestep embedding. Compatible with 'gfp' and 'ds' embeddings.")
175
+ parser.add_argument("--dcunet-fix-length", type=str, default="pad", choices=("pad", "trim", "none"), help="DCUNet strategy to 'fix' mismatched input timespan. 'pad' by default.")
176
+ parser.add_argument("--dcunet-mask-bound", type=str, choices=("tanh", "sigmoid", "none"), default="none", help="DCUNet output bounding strategy. 'none' by default.")
177
+ parser.add_argument("--dcunet-norm-type", type=str, choices=("bN", "CbN"), default="bN", help="The type of norm to use within each encoder and decoder layer. 'bN' (real/imaginary separate batch norm) by default.")
178
+ parser.add_argument("--dcunet-activation", type=str, choices=("leaky_relu", "relu", "silu"), default="leaky_relu", help="The activation to use within each encoder and decoder layer. 'leaky_relu' by default.")
179
+ return parser
180
+
181
+ def __init__(
182
+ self,
183
+ dcunet_architecture: str = "DilDCUNet-v2",
184
+ dcunet_time_embedding: str = "gfp",
185
+ dcunet_temb_layers_global: int = 2,
186
+ dcunet_temb_layers_local: int = 1,
187
+ dcunet_temb_activation: str = "silu",
188
+ dcunet_time_embedding_complex: bool = False,
189
+ dcunet_fix_length: str = "pad",
190
+ dcunet_mask_bound: str = "none",
191
+ dcunet_norm_type: str = "bN",
192
+ dcunet_activation: str = "relu",
193
+ embed_dim: int = 128,
194
+ **kwargs
195
+ ):
196
+ super().__init__()
197
+
198
+ self.architecture = dcunet_architecture
199
+ self.fix_length_mode = (dcunet_fix_length if dcunet_fix_length != "none" else None)
200
+ self.norm_type = dcunet_norm_type
201
+ self.activation = dcunet_activation
202
+ self.input_channels = 2 # for x_t and y -- note that this is 2 rather than 4, because we directly treat complex channels in this DNN
203
+ self.time_embedding = (dcunet_time_embedding if dcunet_time_embedding != "none" else None)
204
+ self.time_embedding_complex = dcunet_time_embedding_complex
205
+ self.temb_layers_global = dcunet_temb_layers_global
206
+ self.temb_layers_local = dcunet_temb_layers_local
207
+ self.temb_activation = dcunet_temb_activation
208
+ conf_encoders, conf_decoders = DCUNET_ARCHITECTURES[dcunet_architecture]
209
+
210
+ # Replace `input_channels` in encoders config
211
+ _replaced_input_channels, *rest = conf_encoders[0]
212
+ encoders = ((self.input_channels, *rest), *conf_encoders[1:])
213
+ decoders = conf_decoders
214
+ self.encoders_stride_product = np.prod(
215
+ [enc_stride for _, _, _, enc_stride, _, _ in encoders], axis=0
216
+ )
217
+
218
+ # Prepare kwargs for encoder and decoder (to potentially be modified before layer instantiation)
219
+ encoder_decoder_kwargs = dict(
220
+ norm_type=self.norm_type, activation=self.activation,
221
+ temb_layers=self.temb_layers_local, temb_activation=self.temb_activation)
222
+
223
+ # Instantiate (global) time embedding layer
224
+ embed_ops = []
225
+ if self.time_embedding is not None:
226
+ complex_valued = self.time_embedding_complex
227
+ if self.time_embedding == "gfp":
228
+ embed_ops += [GaussianFourierProjection(embed_dim=embed_dim, complex_valued=complex_valued)]
229
+ encoder_decoder_kwargs["embed_dim"] = embed_dim
230
+ elif self.time_embedding == "ds":
231
+ embed_ops += [DiffusionStepEmbedding(embed_dim=embed_dim, complex_valued=complex_valued)]
232
+ encoder_decoder_kwargs["embed_dim"] = embed_dim
233
+
234
+ if self.time_embedding_complex:
235
+ assert self.time_embedding in ("gfp", "ds"), "Complex timestep embedding only available for gfp and ds"
236
+ encoder_decoder_kwargs["complex_time_embedding"] = True
237
+ for _ in range(self.temb_layers_global):
238
+ embed_ops += [
239
+ ComplexLinear(embed_dim, embed_dim, complex_valued=True),
240
+ OnReIm(get_activation(dcunet_temb_activation))
241
+ ]
242
+ self.embed = nn.Sequential(*embed_ops)
243
+
244
+ ### Instantiate DCUNet layers ###
245
+ output_layer = ComplexConvTranspose2d(*decoders[-1])
246
+ encoders = [DCUNetComplexEncoderBlock(*args, **encoder_decoder_kwargs) for args in encoders]
247
+ decoders = [DCUNetComplexDecoderBlock(*args, **encoder_decoder_kwargs) for args in decoders[:-1]]
248
+
249
+ self.mask_bound = (dcunet_mask_bound if dcunet_mask_bound != "none" else None)
250
+ if self.mask_bound is not None:
251
+ raise NotImplementedError("sorry, mask bounding not implemented at the moment")
252
+ # TODO we can't use nn.Sequential since the ComplexConvTranspose2d needs a second `output_size` argument
253
+ #operations = (output_layer, complex_nn.BoundComplexMask(self.mask_bound))
254
+ #output_layer = nn.Sequential(*[x for x in operations if x is not None])
255
+
256
+ assert len(encoders) == len(decoders) + 1
257
+ self.encoders = nn.ModuleList(encoders)
258
+ self.decoders = nn.ModuleList(decoders)
259
+ self.output_layer = output_layer or nn.Identity()
260
+
261
+ def forward(self, spec, t) -> Tensor:
262
+ """
263
+ Input shape is expected to be $(batch, channels, nfreqs, time)$, with $nfreqs - 1$
+ divisible by $f_0 * f_1 * ... * f_N$ where $f_k$ are the frequency strides of the
+ encoders, and $time - 1$ divisible by $t_0 * t_1 * ... * t_N$ where $t_k$ are the
+ time strides of the encoders.
+ Args:
+ spec (Tensor): complex spectrogram tensor of shape (batch, channels, nfreqs, time).
+ t (Tensor): timestep/noise-level conditioning tensor of shape (batch,).
+ Returns:
+ Tensor: complex spectrogram estimate of shape (batch, 1, nfreqs, time).
271
+ """
272
+ # TF-rep shape: (batch, self.input_channels, n_fft, frames)
273
+ # Estimate mask from time-frequency representation.
274
+ x_in = self.fix_input_dims(spec)
275
+ x = x_in
276
+ t_embed = self.embed(t+0j) if self.time_embedding is not None else None
277
+
278
+ enc_outs = []
279
+ for enc in self.encoders:
280
+ x = enc(x, t_embed)
281
+ # UNet skip connection
282
+ enc_outs.append(x)
283
+ for (enc_out, dec) in zip(reversed(enc_outs[:-1]), self.decoders):
284
+ x = dec(x, t_embed, output_size=enc_out.shape)
285
+ x = torch.cat([x, enc_out], dim=1)
286
+
287
+ output = self.output_layer(x, output_size=x_in.shape)
288
+ # output shape: (batch, 1, n_fft, frames)
289
+ output = self.fix_output_dims(output, spec)
290
+ return output
291
+
292
+ def fix_input_dims(self, x):
293
+ return _fix_dcu_input_dims(
294
+ self.fix_length_mode, x, torch.from_numpy(self.encoders_stride_product)
295
+ )
296
+
297
+ def fix_output_dims(self, out, x):
298
+ return _fix_dcu_output_dims(self.fix_length_mode, out, x)
299
+
300
+
301
+ def _fix_dcu_input_dims(fix_length_mode, x, encoders_stride_product):
302
+ """Pad or trim `x` to a length compatible with DCUNet."""
303
+ freq_prod = int(encoders_stride_product[0])
304
+ time_prod = int(encoders_stride_product[1])
305
+ if (x.shape[2] - 1) % freq_prod:
306
+ raise TypeError(
307
+ f"Input shape must be [batch, ch, freq + 1, time + 1] with freq divisible by "
308
+ f"{freq_prod}, got {x.shape} instead"
309
+ )
310
+ time_remainder = (x.shape[3] - 1) % time_prod
311
+ if time_remainder:
312
+ if fix_length_mode is None:
313
+ raise TypeError(
314
+ f"Input shape must be [batch, ch, freq + 1, time + 1] with time divisible by "
315
+ f"{time_prod}, got {x.shape} instead. Set the 'fix_length_mode' argument "
316
+ f"in 'DCUNet' to 'pad' or 'trim' to fix shapes automatically."
317
+ )
318
+ elif fix_length_mode == "pad":
319
+ pad_shape = [0, time_prod - time_remainder]
320
+ x = nn.functional.pad(x, pad_shape, mode="constant")
321
+ elif fix_length_mode == "trim":
322
+ pad_shape = [0, -time_remainder]
323
+ x = nn.functional.pad(x, pad_shape, mode="constant")
324
+ else:
325
+ raise ValueError(f"Unknown fix_length mode '{fix_length_mode}'")
326
+ return x
327
+
328
+
329
+ def _fix_dcu_output_dims(fix_length_mode, out, x):
330
+ """Fix shape of `out` to the original shape of `x` by padding/cropping."""
331
+ inp_len = x.shape[-1]
332
+ output_len = out.shape[-1]
333
+ return nn.functional.pad(out, [0, inp_len - output_len])
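+ # (nn.functional.pad accepts negative amounts, so this crops `out` when it is
+ # longer than the input and zero-pads it when it is shorter)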
334
+
335
+
336
+ def _get_norm(norm_type):
337
+ if norm_type == "CbN":
338
+ return ComplexBatchNorm
339
+ elif norm_type == "bN":
340
+ return partial(OnReIm, BatchNorm)
341
+ else:
342
+ raise NotImplementedError(f"Unknown norm type: {norm_type}")
343
+
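+ # Note (added for clarity): "bN" normalizes real and imaginary parts with two
+ # independent real-valued BatchNorms via OnReIm, whereas "CbN" whitens them
+ # jointly using the full 2x2 covariance (see ComplexBatchNorm below).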
344
+
345
+ class DCUNetComplexEncoderBlock(nn.Module):
346
+ def __init__(
347
+ self,
348
+ in_chan,
349
+ out_chan,
350
+ kernel_size,
351
+ stride,
352
+ padding,
353
+ dilation,
354
+ norm_type="bN",
355
+ activation="leaky_relu",
356
+ embed_dim=None,
357
+ complex_time_embedding=False,
358
+ temb_layers=1,
359
+ temb_activation="silu"
360
+ ):
361
+ super().__init__()
362
+
363
+ self.in_chan = in_chan
364
+ self.out_chan = out_chan
365
+ self.kernel_size = kernel_size
366
+ self.stride = stride
367
+ self.padding = padding
368
+ self.dilation = dilation
369
+ self.temb_layers = temb_layers
370
+ self.temb_activation = temb_activation
371
+ self.complex_time_embedding = complex_time_embedding
372
+
373
+ self.conv = ComplexConv2d(
374
+ in_chan, out_chan, kernel_size, stride, padding, bias=norm_type is None, dilation=dilation
375
+ )
376
+ self.norm = _get_norm(norm_type)(out_chan)
377
+ self.activation = OnReIm(get_activation(activation))
378
+ self.embed_dim = embed_dim
379
+ if self.embed_dim is not None:
380
+ ops = []
381
+ for _ in range(max(0, self.temb_layers - 1)):
382
+ ops += [
383
+ ComplexLinear(self.embed_dim, self.embed_dim, complex_valued=True),
384
+ OnReIm(get_activation(self.temb_activation))
385
+ ]
386
+ ops += [
387
+ FeatureMapDense(self.embed_dim, self.out_chan, complex_valued=True),
388
+ OnReIm(get_activation(self.temb_activation))
389
+ ]
390
+ self.embed_layer = nn.Sequential(*ops)
391
+
392
+ def forward(self, x, t_embed):
393
+ y = self.conv(x)
394
+ if self.embed_dim is not None:
395
+ y = y + self.embed_layer(t_embed)
396
+ return self.activation(self.norm(y))
397
+
398
+
399
+ class DCUNetComplexDecoderBlock(nn.Module):
400
+ def __init__(
401
+ self,
402
+ in_chan,
403
+ out_chan,
404
+ kernel_size,
405
+ stride,
406
+ padding,
407
+ dilation,
408
+ output_padding=(0, 0),
409
+ norm_type="bN",
410
+ activation="leaky_relu",
411
+ embed_dim=None,
412
+ temb_layers=1,
413
+ temb_activation='swish',
414
+ complex_time_embedding=False,
415
+ ):
416
+ super().__init__()
417
+
418
+ self.in_chan = in_chan
419
+ self.out_chan = out_chan
420
+ self.kernel_size = kernel_size
421
+ self.stride = stride
422
+ self.padding = padding
423
+ self.dilation = dilation
424
+ self.output_padding = output_padding
425
+ self.complex_time_embedding = complex_time_embedding
426
+ self.temb_layers = temb_layers
427
+ self.temb_activation = temb_activation
428
+
429
+ self.deconv = ComplexConvTranspose2d(
430
+ in_chan, out_chan, kernel_size, stride, padding, output_padding, dilation=dilation, bias=norm_type is None
431
+ )
432
+ self.norm = _get_norm(norm_type)(out_chan)
433
+ self.activation = OnReIm(get_activation(activation))
434
+ self.embed_dim = embed_dim
435
+ if self.embed_dim is not None:
436
+ ops = []
437
+ for _ in range(max(0, self.temb_layers - 1)):
438
+ ops += [
439
+ ComplexLinear(self.embed_dim, self.embed_dim, complex_valued=True),
440
+ OnReIm(get_activation(self.temb_activation))
441
+ ]
442
+ ops += [
443
+ FeatureMapDense(self.embed_dim, self.out_chan, complex_valued=True),
444
+ OnReIm(get_activation(self.temb_activation))
445
+ ]
446
+ self.embed_layer = nn.Sequential(*ops)
447
+
448
+ def forward(self, x, t_embed, output_size=None):
449
+ y = self.deconv(x, output_size=output_size)
450
+ if self.embed_dim is not None:
451
+ y = y + self.embed_layer(t_embed)
452
+ return self.activation(self.norm(y))
453
+
454
+
455
+ # From https://github.com/chanil1218/DCUnet.pytorch/blob/2dcdd30804be47a866fde6435cbb7e2f81585213/models/layers/complexnn.py
456
+ class ComplexBatchNorm(torch.nn.Module):
457
+ def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, track_running_stats=False):
458
+ super(ComplexBatchNorm, self).__init__()
459
+ self.num_features = num_features
460
+ self.eps = eps
461
+ self.momentum = momentum
462
+ self.affine = affine
463
+ self.track_running_stats = track_running_stats
464
+ if self.affine:
465
+ self.Wrr = torch.nn.Parameter(torch.Tensor(num_features))
466
+ self.Wri = torch.nn.Parameter(torch.Tensor(num_features))
467
+ self.Wii = torch.nn.Parameter(torch.Tensor(num_features))
468
+ self.Br = torch.nn.Parameter(torch.Tensor(num_features))
469
+ self.Bi = torch.nn.Parameter(torch.Tensor(num_features))
470
+ else:
471
+ self.register_parameter('Wrr', None)
472
+ self.register_parameter('Wri', None)
473
+ self.register_parameter('Wii', None)
474
+ self.register_parameter('Br', None)
475
+ self.register_parameter('Bi', None)
476
+ if self.track_running_stats:
477
+ self.register_buffer('RMr', torch.zeros(num_features))
478
+ self.register_buffer('RMi', torch.zeros(num_features))
479
+ self.register_buffer('RVrr', torch.ones(num_features))
+ self.register_buffer('RVri', torch.zeros(num_features))
+ self.register_buffer('RVii', torch.ones(num_features))
482
+ self.register_buffer('num_batches_tracked', torch.tensor(0, dtype=torch.long))
483
+ else:
484
+ self.register_parameter('RMr', None)
485
+ self.register_parameter('RMi', None)
486
+ self.register_parameter('RVrr', None)
487
+ self.register_parameter('RVri', None)
488
+ self.register_parameter('RVii', None)
489
+ self.register_parameter('num_batches_tracked', None)
490
+ self.reset_parameters()
491
+
492
+ def reset_running_stats(self):
493
+ if self.track_running_stats:
494
+ self.RMr.zero_()
495
+ self.RMi.zero_()
496
+ self.RVrr.fill_(1)
497
+ self.RVri.zero_()
498
+ self.RVii.fill_(1)
499
+ self.num_batches_tracked.zero_()
500
+
501
+ def reset_parameters(self):
502
+ self.reset_running_stats()
503
+ if self.affine:
504
+ self.Br.data.zero_()
505
+ self.Bi.data.zero_()
506
+ self.Wrr.data.fill_(1)
507
+ self.Wri.data.uniform_(-.9, +.9) # W will be positive-definite
508
+ self.Wii.data.fill_(1)
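+ # (with Wrr = Wii = 1 and |Wri| < 1, det(W) = 1 - Wri**2 > 0, so W is positive-definite)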
509
+
510
+ def _check_input_dim(self, xr, xi):
511
+ assert(xr.shape == xi.shape)
512
+ assert(xr.size(1) == self.num_features)
513
+
514
+ def forward(self, x):
515
+ xr, xi = x.real, x.imag
516
+ self._check_input_dim(xr, xi)
517
+
518
+ exponential_average_factor = 0.0
519
+
520
+ if self.training and self.track_running_stats:
521
+ self.num_batches_tracked += 1
522
+ if self.momentum is None: # use cumulative moving average
523
+ exponential_average_factor = 1.0 / self.num_batches_tracked.item()
524
+ else: # use exponential moving average
525
+ exponential_average_factor = self.momentum
526
+
527
+ #
528
+ # NOTE: The precise meaning of the "training flag" is:
529
+ # True: Normalize using batch statistics, update running statistics
530
+ # if they are being collected.
531
+ # False: Normalize using running statistics, ignore batch statistics.
532
+ #
533
+ training = self.training or not self.track_running_stats
534
+ redux = [i for i in reversed(range(xr.dim())) if i!=1]
535
+ vdim = [1] * xr.dim()
536
+ vdim[1] = xr.size(1)
537
+
538
+ #
539
+ # Mean M Computation and Centering
540
+ #
541
+ # Includes running mean update if training and running.
542
+ #
543
+ if training:
544
+ Mr, Mi = xr, xi
545
+ for d in redux:
546
+ Mr = Mr.mean(d, keepdim=True)
547
+ Mi = Mi.mean(d, keepdim=True)
548
+ if self.track_running_stats:
549
+ self.RMr.lerp_(Mr.squeeze(), exponential_average_factor)
550
+ self.RMi.lerp_(Mi.squeeze(), exponential_average_factor)
551
+ else:
552
+ Mr = self.RMr.view(vdim)
553
+ Mi = self.RMi.view(vdim)
554
+ xr, xi = xr-Mr, xi-Mi
555
+
556
+ #
557
+ # Variance Matrix V Computation
558
+ #
559
+ # Includes epsilon numerical stabilizer/Tikhonov regularizer.
560
+ # Includes running variance update if training and running.
561
+ #
562
+ if training:
563
+ Vrr = xr * xr
564
+ Vri = xr * xi
565
+ Vii = xi * xi
566
+ for d in redux:
567
+ Vrr = Vrr.mean(d, keepdim=True)
568
+ Vri = Vri.mean(d, keepdim=True)
569
+ Vii = Vii.mean(d, keepdim=True)
570
+ if self.track_running_stats:
571
+ self.RVrr.lerp_(Vrr.squeeze(), exponential_average_factor)
572
+ self.RVri.lerp_(Vri.squeeze(), exponential_average_factor)
573
+ self.RVii.lerp_(Vii.squeeze(), exponential_average_factor)
574
+ else:
575
+ Vrr = self.RVrr.view(vdim)
576
+ Vri = self.RVri.view(vdim)
577
+ Vii = self.RVii.view(vdim)
578
+ Vrr = Vrr + self.eps
579
+ Vri = Vri
580
+ Vii = Vii + self.eps
581
+
582
+ #
583
+ # Matrix Inverse Square Root U = V^-0.5
584
+ #
585
+ # sqrt of a 2x2 matrix,
586
+ # - https://en.wikipedia.org/wiki/Square_root_of_a_2_by_2_matrix
587
+ tau = Vrr + Vii
588
+ delta = torch.addcmul(Vrr * Vii, Vri, Vri, value=-1)
589
+ s = delta.sqrt()
590
+ t = (tau + 2*s).sqrt()
591
+
592
+ # matrix inverse, http://mathworld.wolfram.com/MatrixInverse.html
593
+ rst = (s * t).reciprocal()
594
+ Urr = (s + Vii) * rst
595
+ Uii = (s + Vrr) * rst
596
+ Uri = ( - Vri) * rst
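+ # Added note: sqrt(V) = (V + s*I) / t has determinant s, so its inverse is
+ # U = adj(sqrt(V)) / s = [[Vii + s, -Vri], [-Vri, Vrr + s]] / (s * t).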
597
+
598
+ #
599
+ # Optionally left-multiply U by affine weights W to produce combined
600
+ # weights Z, left-multiply the inputs by Z, then optionally bias them.
601
+ #
602
+ # y = Zx + B
603
+ # y = WUx + B
604
+ # y = [Wrr Wri][Urr Uri] [xr] + [Br]
605
+ # [Wir Wii][Uir Uii] [xi] [Bi]
606
+ #
607
+ if self.affine:
608
+ Wrr, Wri, Wii = self.Wrr.view(vdim), self.Wri.view(vdim), self.Wii.view(vdim)
609
+ Zrr = (Wrr * Urr) + (Wri * Uri)
610
+ Zri = (Wrr * Uri) + (Wri * Uii)
611
+ Zir = (Wri * Urr) + (Wii * Uri)
612
+ Zii = (Wri * Uri) + (Wii * Uii)
613
+ else:
614
+ Zrr, Zri, Zir, Zii = Urr, Uri, Uri, Uii
615
+
616
+ yr = (Zrr * xr) + (Zri * xi)
617
+ yi = (Zir * xr) + (Zii * xi)
618
+
619
+ if self.affine:
620
+ yr = yr + self.Br.view(vdim)
621
+ yi = yi + self.Bi.view(vdim)
622
+
623
+ return torch.view_as_complex(torch.stack([yr, yi], dim=-1))
624
+
625
+ def extra_repr(self):
626
+ return '{num_features}, eps={eps}, momentum={momentum}, affine={affine}, ' \
627
+ 'track_running_stats={track_running_stats}'.format(**self.__dict__)
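
A minimal usage sketch for the DCUNet backbone above (illustrative only: it assumes this file lives at sgmse/backbones/dcunet.py and that the helpers imported from .shared behave as their names suggest):

import torch
from sgmse.backbones.dcunet import DCUNet

model = DCUNet(dcunet_architecture="DilDCUNet-v2")      # default "gfp" time embedding
spec = torch.randn(1, 2, 257, 257, dtype=torch.cfloat)  # (batch, [x_t, y], nfreqs, time)
t = torch.rand(1)                                       # one diffusion timestep per batch item
out = model(spec, t)                                    # complex, (1, 1, 257, 257) per the shape comment in forward

Here 257 is chosen so that (257 - 1) is divisible by this architecture's stride products (8 along frequency, 4 along time).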
sgmse/backbones/ncsnpp.py CHANGED
@@ -1,420 +1,419 @@
1
+ # coding=utf-8
2
+ # Copyright 2020 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ # pylint: skip-file
17
+
18
+ from .ncsnpp_utils import layers, layerspp, normalization
19
+ import torch.nn as nn
20
+ import functools
21
+ import torch
22
+ import numpy as np
23
+
24
+ from .shared import BackboneRegistry
25
+
26
+ ResnetBlockDDPM = layerspp.ResnetBlockDDPMpp
27
+ ResnetBlockBigGAN = layerspp.ResnetBlockBigGANpp
28
+ Combine = layerspp.Combine
29
+ conv3x3 = layerspp.conv3x3
30
+ conv1x1 = layerspp.conv1x1
31
+ get_act = layers.get_act
32
+ get_normalization = normalization.get_normalization
33
+ default_initializer = layers.default_init
34
+
35
+
36
+ @BackboneRegistry.register("ncsnpp")
37
+ class NCSNpp(nn.Module):
38
+ """NCSN++ model, adapted from https://github.com/yang-song/score_sde repository"""
39
+
40
+ @staticmethod
41
+ def add_argparse_args(parser):
42
+ parser.add_argument("--ch_mult",type=int, nargs='+', default=[1,1,2,2,2,2,2])
43
+ parser.add_argument("--num_res_blocks", type=int, default=2)
44
+ parser.add_argument("--attn_resolutions", type=int, nargs='+', default=[16])
45
+ parser.add_argument("--no-centered", dest="centered", action="store_false", help="The data is not centered [-1, 1]")
46
+ parser.add_argument("--centered", dest="centered", action="store_true", help="The data is centered [-1, 1]")
47
+ parser.set_defaults(centered=True)
48
+ return parser
49
+
50
+ def __init__(self,
51
+ scale_by_sigma = True,
52
+ nonlinearity = 'swish',
53
+ nf = 128,
54
+ ch_mult = (1, 1, 2, 2, 2, 2, 2),
55
+ num_res_blocks = 2,
56
+ attn_resolutions = (16,),
57
+ resamp_with_conv = True,
58
+ conditional = True,
59
+ fir = True,
60
+ fir_kernel = [1, 3, 3, 1],
61
+ skip_rescale = True,
62
+ resblock_type = 'biggan',
63
+ progressive = 'output_skip',
64
+ progressive_input = 'input_skip',
65
+ progressive_combine = 'sum',
66
+ init_scale = 0.,
67
+ fourier_scale = 16,
68
+ image_size = 256,
69
+ embedding_type = 'fourier',
70
+ dropout = .0,
71
+ centered = True,
72
+ **unused_kwargs
73
+ ):
74
+ super().__init__()
75
+ self.act = act = get_act(nonlinearity)
76
+
77
+ self.nf = nf
+ self.num_res_blocks = num_res_blocks
+ self.attn_resolutions = attn_resolutions
+ self.num_resolutions = num_resolutions = len(ch_mult)
+ self.all_resolutions = all_resolutions = [image_size // (2 ** i) for i in range(num_resolutions)]
+
+ self.conditional = conditional # noise-conditional
+ self.centered = centered
+ self.scale_by_sigma = scale_by_sigma
+
+ self.skip_rescale = skip_rescale
+ self.resblock_type = resblock_type = resblock_type.lower()
+ self.progressive = progressive = progressive.lower()
+ self.progressive_input = progressive_input = progressive_input.lower()
+ self.embedding_type = embedding_type = embedding_type.lower()
98
+ assert progressive in ['none', 'output_skip', 'residual']
99
+ assert progressive_input in ['none', 'input_skip', 'residual']
100
+ assert embedding_type in ['fourier', 'positional']
101
+ combine_method = progressive_combine.lower()
102
+ combiner = functools.partial(Combine, method=combine_method)
103
+
104
+ num_channels = 4 # x.real, x.imag, y.real, y.imag
105
+ self.output_layer = nn.Conv2d(num_channels, 2, 1)
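+ # (these 2 real output channels are recombined into a single complex channel
+ # with torch.view_as_complex at the end of forward)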
106
+
107
+ modules = []
108
+ # timestep/noise_level embedding
109
+ if embedding_type == 'fourier':
110
+ # Gaussian Fourier features embeddings.
111
+ modules.append(layerspp.GaussianFourierProjection(
112
+ embedding_size=nf, scale=fourier_scale
113
+ ))
114
+ embed_dim = 2 * nf
115
+ elif embedding_type == 'positional':
116
+ embed_dim = nf
117
+ else:
118
+ raise ValueError(f'embedding type {embedding_type} unknown.')
119
+
120
+ if conditional:
121
+ modules.append(nn.Linear(embed_dim, nf * 4))
122
+ modules[-1].weight.data = default_initializer()(modules[-1].weight.shape)
123
+ nn.init.zeros_(modules[-1].bias)
124
+ modules.append(nn.Linear(nf * 4, nf * 4))
125
+ modules[-1].weight.data = default_initializer()(modules[-1].weight.shape)
126
+ nn.init.zeros_(modules[-1].bias)
127
+
128
+ AttnBlock = functools.partial(layerspp.AttnBlockpp,
129
+ init_scale=init_scale, skip_rescale=skip_rescale)
130
+
131
+ Upsample = functools.partial(layerspp.Upsample,
132
+ with_conv=resamp_with_conv, fir=fir, fir_kernel=fir_kernel)
133
+
134
+ if progressive == 'output_skip':
135
+ self.pyramid_upsample = layerspp.Upsample(fir=fir, fir_kernel=fir_kernel, with_conv=False)
136
+ elif progressive == 'residual':
137
+ pyramid_upsample = functools.partial(layerspp.Upsample, fir=fir,
138
+ fir_kernel=fir_kernel, with_conv=True)
139
+
140
+ Downsample = functools.partial(layerspp.Downsample, with_conv=resamp_with_conv, fir=fir, fir_kernel=fir_kernel)
141
+
142
+ if progressive_input == 'input_skip':
143
+ self.pyramid_downsample = layerspp.Downsample(fir=fir, fir_kernel=fir_kernel, with_conv=False)
144
+ elif progressive_input == 'residual':
145
+ pyramid_downsample = functools.partial(layerspp.Downsample,
146
+ fir=fir, fir_kernel=fir_kernel, with_conv=True)
147
+
148
+ if resblock_type == 'ddpm':
149
+ ResnetBlock = functools.partial(ResnetBlockDDPM, act=act,
150
+ dropout=dropout, init_scale=init_scale,
151
+ skip_rescale=skip_rescale, temb_dim=nf * 4)
152
+
153
+ elif resblock_type == 'biggan':
154
+ ResnetBlock = functools.partial(ResnetBlockBigGAN, act=act,
155
+ dropout=dropout, fir=fir, fir_kernel=fir_kernel,
156
+ init_scale=init_scale, skip_rescale=skip_rescale, temb_dim=nf * 4)
157
+
158
+ else:
159
+ raise ValueError(f'resblock type {resblock_type} unrecognized.')
160
+
161
+ # Downsampling block
162
+
163
+ channels = num_channels
164
+ if progressive_input != 'none':
165
+ input_pyramid_ch = channels
166
+
167
+ modules.append(conv3x3(channels, nf))
168
+ hs_c = [nf]
169
+
170
+ in_ch = nf
171
+ for i_level in range(num_resolutions):
172
+ # Residual blocks for this resolution
173
+ for i_block in range(num_res_blocks):
174
+ out_ch = nf * ch_mult[i_level]
175
+ modules.append(ResnetBlock(in_ch=in_ch, out_ch=out_ch))
176
+ in_ch = out_ch
177
+
178
+ if all_resolutions[i_level] in attn_resolutions:
179
+ modules.append(AttnBlock(channels=in_ch))
180
+ hs_c.append(in_ch)
181
+
182
+ if i_level != num_resolutions - 1:
183
+ if resblock_type == 'ddpm':
184
+ modules.append(Downsample(in_ch=in_ch))
185
+ else:
186
+ modules.append(ResnetBlock(down=True, in_ch=in_ch))
187
+
188
+ if progressive_input == 'input_skip':
189
+ modules.append(combiner(dim1=input_pyramid_ch, dim2=in_ch))
190
+ if combine_method == 'cat':
191
+ in_ch *= 2
192
+
193
+ elif progressive_input == 'residual':
194
+ modules.append(pyramid_downsample(in_ch=input_pyramid_ch, out_ch=in_ch))
195
+ input_pyramid_ch = in_ch
196
+
197
+ hs_c.append(in_ch)
198
+
199
+ in_ch = hs_c[-1]
200
+ modules.append(ResnetBlock(in_ch=in_ch))
201
+ modules.append(AttnBlock(channels=in_ch))
202
+ modules.append(ResnetBlock(in_ch=in_ch))
203
+
204
+ pyramid_ch = 0
205
+ # Upsampling block
206
+ for i_level in reversed(range(num_resolutions)):
207
+ for i_block in range(num_res_blocks + 1): # +1 blocks in upsampling because of skip connection from combiner (after downsampling)
208
+ out_ch = nf * ch_mult[i_level]
209
+ modules.append(ResnetBlock(in_ch=in_ch + hs_c.pop(), out_ch=out_ch))
210
+ in_ch = out_ch
211
+
212
+ if all_resolutions[i_level] in attn_resolutions:
213
+ modules.append(AttnBlock(channels=in_ch))
214
+
215
+ if progressive != 'none':
216
+ if i_level == num_resolutions - 1:
217
+ if progressive == 'output_skip':
218
+ modules.append(nn.GroupNorm(num_groups=min(in_ch // 4, 32),
219
+ num_channels=in_ch, eps=1e-6))
220
+ modules.append(conv3x3(in_ch, channels, init_scale=init_scale))
221
+ pyramid_ch = channels
222
+ elif progressive == 'residual':
223
+ modules.append(nn.GroupNorm(num_groups=min(in_ch // 4, 32), num_channels=in_ch, eps=1e-6))
224
+ modules.append(conv3x3(in_ch, in_ch, bias=True))
225
+ pyramid_ch = in_ch
226
+ else:
227
+ raise ValueError(f'{progressive} is not a valid name.')
228
+ else:
229
+ if progressive == 'output_skip':
230
+ modules.append(nn.GroupNorm(num_groups=min(in_ch // 4, 32),
231
+ num_channels=in_ch, eps=1e-6))
232
+ modules.append(conv3x3(in_ch, channels, bias=True, init_scale=init_scale))
233
+ pyramid_ch = channels
234
+ elif progressive == 'residual':
235
+ modules.append(pyramid_upsample(in_ch=pyramid_ch, out_ch=in_ch))
236
+ pyramid_ch = in_ch
237
+ else:
238
+ raise ValueError(f'{progressive} is not a valid name')
239
+
240
+ if i_level != 0:
241
+ if resblock_type == 'ddpm':
242
+ modules.append(Upsample(in_ch=in_ch))
243
+ else:
244
+ modules.append(ResnetBlock(in_ch=in_ch, up=True))
245
+
246
+ assert not hs_c
247
+
248
+ if progressive != 'output_skip':
249
+ modules.append(nn.GroupNorm(num_groups=min(in_ch // 4, 32),
250
+ num_channels=in_ch, eps=1e-6))
251
+ modules.append(conv3x3(in_ch, channels, init_scale=init_scale))
252
+
253
+ self.all_modules = nn.ModuleList(modules)
254
+
255
+
256
+ def forward(self, x, time_cond):
257
+ # timestep/noise_level embedding; only for continuous training
258
+ modules = self.all_modules
259
+ m_idx = 0
260
+
261
+ # Convert real and imaginary parts of (x,y) into four channel dimensions
262
+ x = torch.cat((x[:,[0],:,:].real, x[:,[0],:,:].imag,
263
+ x[:,[1],:,:].real, x[:,[1],:,:].imag), dim=1)
264
+
265
+ if self.embedding_type == 'fourier':
266
+ # Gaussian Fourier features embeddings.
267
+ used_sigmas = time_cond
268
+ temb = modules[m_idx](torch.log(used_sigmas))
269
+ m_idx += 1
270
+
271
+ elif self.embedding_type == 'positional':
272
+ # Sinusoidal positional embeddings.
273
+ timesteps = time_cond
274
+ used_sigmas = self.sigmas[time_cond.long()]
275
+ temb = layers.get_timestep_embedding(timesteps, self.nf)
276
+
277
+ else:
278
+ raise ValueError(f'embedding type {self.embedding_type} unknown.')
279
+
280
+ if self.conditional:
281
+ temb = modules[m_idx](temb)
282
+ m_idx += 1
283
+ temb = modules[m_idx](self.act(temb))
284
+ m_idx += 1
285
+ else:
286
+ temb = None
287
+
288
+ if not self.centered:
289
+ # If input data is in [0, 1]
290
+ x = 2 * x - 1.
291
+
292
+ # Downsampling block
293
+ input_pyramid = None
294
+ if self.progressive_input != 'none':
295
+ input_pyramid = x
296
+
297
+ # Input layer: Conv2d: 4ch -> 128ch
298
+ hs = [modules[m_idx](x)]
299
+ m_idx += 1
300
+
301
+ # Down path in U-Net
302
+ for i_level in range(self.num_resolutions):
303
+ # Residual blocks for this resolution
304
+ for i_block in range(self.num_res_blocks):
305
+ h = modules[m_idx](hs[-1], temb)
306
+ m_idx += 1
307
+ # Attention layer (optional)
308
+ if h.shape[-2] in self.attn_resolutions: # edit: check H dim (-2) not W dim (-1)
309
+ h = modules[m_idx](h)
310
+ m_idx += 1
311
+ hs.append(h)
312
+
313
+ # Downsampling
314
+ if i_level != self.num_resolutions - 1:
315
+ if self.resblock_type == 'ddpm':
316
+ h = modules[m_idx](hs[-1])
317
+ m_idx += 1
318
+ else:
319
+ h = modules[m_idx](hs[-1], temb)
320
+ m_idx += 1
321
+
322
+ if self.progressive_input == 'input_skip': # Combine h with x
323
+ input_pyramid = self.pyramid_downsample(input_pyramid)
324
+ h = modules[m_idx](input_pyramid, h)
325
+ m_idx += 1
326
+
327
+ elif self.progressive_input == 'residual':
328
+ input_pyramid = modules[m_idx](input_pyramid)
329
+ m_idx += 1
330
+ if self.skip_rescale:
331
+ input_pyramid = (input_pyramid + h) / np.sqrt(2.)
332
+ else:
333
+ input_pyramid = input_pyramid + h
334
+ h = input_pyramid
335
+ hs.append(h)
336
+
337
+ h = hs[-1] # effectively a no-op: h already equals hs[-1]
338
+ h = modules[m_idx](h, temb) # ResNet block
339
+ m_idx += 1
340
+ h = modules[m_idx](h) # Attention block
341
+ m_idx += 1
342
+ h = modules[m_idx](h, temb) # ResNet block
343
+ m_idx += 1
344
+
345
+ pyramid = None
346
+
347
+ # Upsampling block
348
+ for i_level in reversed(range(self.num_resolutions)):
349
+ for i_block in range(self.num_res_blocks + 1):
350
+ h = modules[m_idx](torch.cat([h, hs.pop()], dim=1), temb)
351
+ m_idx += 1
352
+
353
+ # edit: changed index from -1 (W) to -2 (H), as in the down path
354
+ if h.shape[-2] in self.attn_resolutions:
355
+ h = modules[m_idx](h)
356
+ m_idx += 1
357
+
358
+ if self.progressive != 'none':
359
+ if i_level == self.num_resolutions - 1:
360
+ if self.progressive == 'output_skip':
361
+ pyramid = self.act(modules[m_idx](h)) # GroupNorm
362
+ m_idx += 1
363
+ pyramid = modules[m_idx](pyramid) # Conv2D: 256 -> 4
364
+ m_idx += 1
365
+ elif self.progressive == 'residual':
366
+ pyramid = self.act(modules[m_idx](h))
367
+ m_idx += 1
368
+ pyramid = modules[m_idx](pyramid)
369
+ m_idx += 1
370
+ else:
371
+ raise ValueError(f'{self.progressive} is not a valid name.')
372
+ else:
373
+ if self.progressive == 'output_skip':
374
+ pyramid = self.pyramid_upsample(pyramid) # Upsample
375
+ pyramid_h = self.act(modules[m_idx](h)) # GroupNorm
376
+ m_idx += 1
377
+ pyramid_h = modules[m_idx](pyramid_h)
378
+ m_idx += 1
379
+ pyramid = pyramid + pyramid_h
380
+ elif self.progressive == 'residual':
381
+ pyramid = modules[m_idx](pyramid)
382
+ m_idx += 1
383
+ if self.skip_rescale:
384
+ pyramid = (pyramid + h) / np.sqrt(2.)
385
+ else:
386
+ pyramid = pyramid + h
387
+ h = pyramid
388
+ else:
389
+ raise ValueError(f'{self.progressive} is not a valid name')
390
+
391
+ # Upsampling Layer
392
+ if i_level != 0:
393
+ if self.resblock_type == 'ddpm':
394
+ h = modules[m_idx](h)
395
+ m_idx += 1
396
+ else:
397
+ h = modules[m_idx](h, temb) # Upsampling
398
+ m_idx += 1
399
+
400
+ assert not hs
401
+
402
+ if self.progressive == 'output_skip':
403
+ h = pyramid
404
+ else:
405
+ h = self.act(modules[m_idx](h))
406
+ m_idx += 1
407
+ h = modules[m_idx](h)
408
+ m_idx += 1
409
+
410
+ assert m_idx == len(modules), "Implementation error"
411
+ if self.scale_by_sigma:
412
+ used_sigmas = used_sigmas.reshape((x.shape[0], *([1] * len(x.shape[1:]))))
413
+ h = h / used_sigmas
414
+
415
+ # Convert back to complex number
416
+ h = self.output_layer(h)
417
+ h = torch.permute(h, (0, 2, 3, 1)).contiguous()
418
+ h = torch.view_as_complex(h)[:,None, :, :]
419
+ return h
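For reference, a minimal sketch (not part of the diff) of the complex-to-channel round trip this forward pass relies on; shapes and tensor names here are illustrative assumptions:

import torch

# Pack two complex channels (x, y) into four real channels, as at the top of forward():
xc = torch.randn(2, 2, 16, 16, dtype=torch.cfloat)             # (B, 2, H, W) complex
packed = torch.cat((xc[:, [0]].real, xc[:, [0]].imag,
                    xc[:, [1]].real, xc[:, [1]].imag), dim=1)  # (B, 4, H, W) real

# Unpack a 2-channel real output back into one complex channel, as at the bottom:
h = torch.randn(2, 2, 16, 16)                                  # (B, 2, H, W) real
h = torch.permute(h, (0, 2, 3, 1)).contiguous()                # (B, H, W, 2)
h = torch.view_as_complex(h)[:, None, :, :]                    # (B, 1, H, W) complex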
 
sgmse/backbones/ncsnpp_48k.py CHANGED
@@ -1,424 +1,424 @@
1
- # coding=utf-8
2
- # Copyright 2020 The Google Research Authors.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- # pylint: skip-file
17
-
18
- from .ncsnpp_utils import layers, layerspp, normalization
19
- import torch.nn as nn
20
- import functools
21
- import torch
22
- import numpy as np
23
-
24
- from .shared import BackboneRegistry
25
-
26
- ResnetBlockDDPM = layerspp.ResnetBlockDDPMpp
27
- ResnetBlockBigGAN = layerspp.ResnetBlockBigGANpp
28
- Combine = layerspp.Combine
29
- conv3x3 = layerspp.conv3x3
30
- conv1x1 = layerspp.conv1x1
31
- get_act = layers.get_act
32
- get_normalization = normalization.get_normalization
33
- default_initializer = layers.default_init
34
-
35
-
36
- @BackboneRegistry.register("ncsnpp_48k")
37
- class NCSNpp_48k(nn.Module):
38
- """NCSN++ model, adapted from the https://github.com/yang-song/score_sde repository"""
39
-
40
- @staticmethod
41
- def add_argparse_args(parser):
42
- parser.add_argument("--ch_mult",type=int, nargs='+', default=[1,1,2,2,2,2,2])
43
- parser.add_argument("--num_res_blocks", type=int, default=2)
44
- parser.add_argument("--attn_resolutions", type=int, nargs='+', default=[])
45
- parser.add_argument("--nf", type=int, default=128, help="Number of channels to use in the model")
46
- parser.add_argument("--no-centered", dest="centered", action="store_false", help="The data is not centered [-1, 1]")
47
- parser.add_argument("--centered", dest="centered", action="store_true", help="The data is centered [-1, 1]")
48
- parser.add_argument("--progressive", type=str, default='none', help="Progressive scheme for the output (upsampling) path")
49
- parser.add_argument("--progressive_input", type=str, default='none', help="Progressive scheme for the input (downsampling) path")
50
- parser.set_defaults(centered=True)
51
- return parser
52
-
53
- def __init__(self,
54
- scale_by_sigma = True,
55
- nonlinearity = 'swish',
56
- nf = 128,
57
- ch_mult = (1, 1, 2, 2, 2, 2, 2),
58
- num_res_blocks = 2,
59
- attn_resolutions = (),
60
- resamp_with_conv = True,
61
- conditional = True,
62
- fir = True,
63
- fir_kernel = [1, 3, 3, 1],
64
- skip_rescale = True,
65
- resblock_type = 'biggan',
66
- progressive = 'none',
67
- progressive_input = 'none',
68
- progressive_combine = 'sum',
69
- init_scale = 0.,
70
- fourier_scale = 16,
71
- image_size = 256,
72
- embedding_type = 'fourier',
73
- dropout = .0,
74
- centered = True,
75
- **unused_kwargs
76
- ):
77
- super().__init__()
78
- self.act = act = get_act(nonlinearity)
79
-
80
- self.nf = nf = nf
81
- ch_mult = ch_mult
82
- self.num_res_blocks = num_res_blocks = num_res_blocks
83
- self.attn_resolutions = attn_resolutions
84
- dropout = dropout
85
- resamp_with_conv = resamp_with_conv
86
- self.num_resolutions = num_resolutions = len(ch_mult)
87
- self.all_resolutions = all_resolutions = [image_size // (2 ** i) for i in range(num_resolutions)]
88
-
89
- self.conditional = conditional = conditional # noise-conditional
90
- self.centered = centered
91
- self.scale_by_sigma = scale_by_sigma
92
-
93
- fir = fir
94
- fir_kernel = fir_kernel
95
- self.skip_rescale = skip_rescale = skip_rescale
96
- self.resblock_type = resblock_type = resblock_type.lower()
97
- self.progressive = progressive = progressive.lower()
98
- self.progressive_input = progressive_input = progressive_input.lower()
99
- self.embedding_type = embedding_type = embedding_type.lower()
100
- init_scale = init_scale
101
- assert progressive in ['none', 'output_skip', 'residual']
102
- assert progressive_input in ['none', 'input_skip', 'residual']
103
- assert embedding_type in ['fourier', 'positional']
104
- combine_method = progressive_combine.lower()
105
- combiner = functools.partial(Combine, method=combine_method)
106
-
107
- num_channels = 4 # x.real, x.imag, y.real, y.imag
108
- self.output_layer = nn.Conv2d(num_channels, 2, 1)
109
-
110
- modules = []
111
- # timestep/noise_level embedding
112
- if embedding_type == 'fourier':
113
- # Gaussian Fourier features embeddings.
114
- modules.append(layerspp.GaussianFourierProjection(
115
- embedding_size=nf, scale=fourier_scale
116
- ))
117
- embed_dim = 2 * nf
118
- elif embedding_type == 'positional':
119
- embed_dim = nf
120
- else:
121
- raise ValueError(f'embedding type {embedding_type} unknown.')
122
-
123
- if conditional:
124
- modules.append(nn.Linear(embed_dim, nf * 4))
125
- modules[-1].weight.data = default_initializer()(modules[-1].weight.shape)
126
- nn.init.zeros_(modules[-1].bias)
127
- modules.append(nn.Linear(nf * 4, nf * 4))
128
- modules[-1].weight.data = default_initializer()(modules[-1].weight.shape)
129
- nn.init.zeros_(modules[-1].bias)
130
-
131
- AttnBlock = functools.partial(layerspp.AttnBlockpp,
132
- init_scale=init_scale, skip_rescale=skip_rescale)
133
-
134
- Upsample = functools.partial(layerspp.Upsample,
135
- with_conv=resamp_with_conv, fir=fir, fir_kernel=fir_kernel)
136
-
137
- if progressive == 'output_skip':
138
- self.pyramid_upsample = layerspp.Upsample(fir=fir, fir_kernel=fir_kernel, with_conv=False)
139
- elif progressive == 'residual':
140
- pyramid_upsample = functools.partial(layerspp.Upsample, fir=fir,
141
- fir_kernel=fir_kernel, with_conv=True)
142
-
143
- Downsample = functools.partial(layerspp.Downsample, with_conv=resamp_with_conv, fir=fir, fir_kernel=fir_kernel)
144
-
145
- if progressive_input == 'input_skip':
146
- self.pyramid_downsample = layerspp.Downsample(fir=fir, fir_kernel=fir_kernel, with_conv=False)
147
- elif progressive_input == 'residual':
148
- pyramid_downsample = functools.partial(layerspp.Downsample,
149
- fir=fir, fir_kernel=fir_kernel, with_conv=True)
150
-
151
- if resblock_type == 'ddpm':
152
- ResnetBlock = functools.partial(ResnetBlockDDPM, act=act,
153
- dropout=dropout, init_scale=init_scale,
154
- skip_rescale=skip_rescale, temb_dim=nf * 4)
155
-
156
- elif resblock_type == 'biggan':
157
- ResnetBlock = functools.partial(ResnetBlockBigGAN, act=act,
158
- dropout=dropout, fir=fir, fir_kernel=fir_kernel,
159
- init_scale=init_scale, skip_rescale=skip_rescale, temb_dim=nf * 4)
160
-
161
- else:
162
- raise ValueError(f'resblock type {resblock_type} unrecognized.')
163
-
164
- # Downsampling block
165
-
166
- channels = num_channels
167
- if progressive_input != 'none':
168
- input_pyramid_ch = channels
169
-
170
- modules.append(conv3x3(channels, nf))
171
- hs_c = [nf]
172
-
173
- in_ch = nf
174
- for i_level in range(num_resolutions):
175
- # Residual blocks for this resolution
176
- for i_block in range(num_res_blocks):
177
- out_ch = nf * ch_mult[i_level]
178
- modules.append(ResnetBlock(in_ch=in_ch, out_ch=out_ch))
179
- in_ch = out_ch
180
-
181
- if all_resolutions[i_level] in attn_resolutions:
182
- modules.append(AttnBlock(channels=in_ch))
183
- hs_c.append(in_ch)
184
-
185
- if i_level != num_resolutions - 1:
186
- if resblock_type == 'ddpm':
187
- modules.append(Downsample(in_ch=in_ch))
188
- else:
189
- modules.append(ResnetBlock(down=True, in_ch=in_ch))
190
-
191
- if progressive_input == 'input_skip':
192
- modules.append(combiner(dim1=input_pyramid_ch, dim2=in_ch))
193
- if combine_method == 'cat':
194
- in_ch *= 2
195
-
196
- elif progressive_input == 'residual':
197
- modules.append(pyramid_downsample(in_ch=input_pyramid_ch, out_ch=in_ch))
198
- input_pyramid_ch = in_ch
199
-
200
- hs_c.append(in_ch)
201
-
202
- in_ch = hs_c[-1]
203
- modules.append(ResnetBlock(in_ch=in_ch))
204
- modules.append(AttnBlock(channels=in_ch))
205
- modules.append(ResnetBlock(in_ch=in_ch))
206
-
207
- pyramid_ch = 0
208
- # Upsampling block
209
- for i_level in reversed(range(num_resolutions)):
210
- for i_block in range(num_res_blocks + 1): # one extra block per level: the up path consumes the additional skip connection stored after downsampling
211
- out_ch = nf * ch_mult[i_level]
212
- modules.append(ResnetBlock(in_ch=in_ch + hs_c.pop(), out_ch=out_ch))
213
- in_ch = out_ch
214
-
215
- if all_resolutions[i_level] in attn_resolutions:
216
- modules.append(AttnBlock(channels=in_ch))
217
-
218
- if progressive != 'none':
219
- if i_level == num_resolutions - 1:
220
- if progressive == 'output_skip':
221
- modules.append(nn.GroupNorm(num_groups=min(in_ch // 4, 32),
222
- num_channels=in_ch, eps=1e-6))
223
- modules.append(conv3x3(in_ch, channels, init_scale=init_scale))
224
- pyramid_ch = channels
225
- elif progressive == 'residual':
226
- modules.append(nn.GroupNorm(num_groups=min(in_ch // 4, 32), num_channels=in_ch, eps=1e-6))
227
- modules.append(conv3x3(in_ch, in_ch, bias=True))
228
- pyramid_ch = in_ch
229
- else:
230
- raise ValueError(f'{progressive} is not a valid name.')
231
- else:
232
- if progressive == 'output_skip':
233
- modules.append(nn.GroupNorm(num_groups=min(in_ch // 4, 32),
234
- num_channels=in_ch, eps=1e-6))
235
- modules.append(conv3x3(in_ch, channels, bias=True, init_scale=init_scale))
236
- pyramid_ch = channels
237
- elif progressive == 'residual':
238
- modules.append(pyramid_upsample(in_ch=pyramid_ch, out_ch=in_ch))
239
- pyramid_ch = in_ch
240
- else:
241
- raise ValueError(f'{progressive} is not a valid name')
242
-
243
- if i_level != 0:
244
- if resblock_type == 'ddpm':
245
- modules.append(Upsample(in_ch=in_ch))
246
- else:
247
- modules.append(ResnetBlock(in_ch=in_ch, up=True))
248
-
249
- assert not hs_c
250
-
251
- if progressive != 'output_skip':
252
- modules.append(nn.GroupNorm(num_groups=min(in_ch // 4, 32),
253
- num_channels=in_ch, eps=1e-6))
254
- modules.append(conv3x3(in_ch, channels, init_scale=init_scale))
255
-
256
- self.all_modules = nn.ModuleList(modules)
257
-
258
-
259
- def forward(self, x, time_cond):
260
- # timestep/noise_level embedding; only for continuous training
261
- modules = self.all_modules
262
- m_idx = 0
263
-
264
- # Convert real and imaginary parts of (x,y) into four channel dimensions
265
- x = torch.cat((x[:,[0],:,:].real, x[:,[0],:,:].imag,
266
- x[:,[1],:,:].real, x[:,[1],:,:].imag), dim=1)
267
-
268
- if self.embedding_type == 'fourier':
269
- # Gaussian Fourier features embeddings.
270
- used_sigmas = time_cond
271
- temb = modules[m_idx](torch.log(used_sigmas))
272
- m_idx += 1
273
-
274
- elif self.embedding_type == 'positional':
275
- # Sinusoidal positional embeddings.
276
- timesteps = time_cond
277
- used_sigmas = self.sigmas[time_cond.long()]
278
- temb = layers.get_timestep_embedding(timesteps, self.nf)
279
-
280
- else:
281
- raise ValueError(f'embedding type {self.embedding_type} unknown.')
282
-
283
- if self.conditional:
284
- temb = modules[m_idx](temb)
285
- m_idx += 1
286
- temb = modules[m_idx](self.act(temb))
287
- m_idx += 1
288
- else:
289
- temb = None
290
-
291
- if not self.centered:
292
- # If input data is in [0, 1]
293
- x = 2 * x - 1.
294
-
295
- # Downsampling block
296
- input_pyramid = None
297
- if self.progressive_input != 'none':
298
- input_pyramid = x
299
-
300
- # Input layer: Conv2d: 4ch -> 128ch
301
- hs = [modules[m_idx](x)]
302
- m_idx += 1
303
-
304
- # Down path in U-Net
305
- for i_level in range(self.num_resolutions):
306
- # Residual blocks for this resolution
307
- for i_block in range(self.num_res_blocks):
308
- h = modules[m_idx](hs[-1], temb)
309
- m_idx += 1
310
- # Attention layer (optional)
311
- if h.shape[-2] in self.attn_resolutions: # edit: check H dim (-2) not W dim (-1)
312
- h = modules[m_idx](h)
313
- m_idx += 1
314
- hs.append(h)
315
-
316
- # Downsampling
317
- if i_level != self.num_resolutions - 1:
318
- if self.resblock_type == 'ddpm':
319
- h = modules[m_idx](hs[-1])
320
- m_idx += 1
321
- else:
322
- h = modules[m_idx](hs[-1], temb)
323
- m_idx += 1
324
-
325
- if self.progressive_input == 'input_skip': # Combine h with x
326
- input_pyramid = self.pyramid_downsample(input_pyramid)
327
- h = modules[m_idx](input_pyramid, h)
328
- m_idx += 1
329
-
330
- elif self.progressive_input == 'residual':
331
- input_pyramid = modules[m_idx](input_pyramid)
332
- m_idx += 1
333
- if self.skip_rescale:
334
- input_pyramid = (input_pyramid + h) / np.sqrt(2.)
335
- else:
336
- input_pyramid = input_pyramid + h
337
- h = input_pyramid
338
- hs.append(h)
339
-
340
- h = hs[-1] # effectively a no-op: h already equals hs[-1]
341
- h = modules[m_idx](h, temb) # ResNet block
342
- m_idx += 1
343
- h = modules[m_idx](h) # Attention block
344
- m_idx += 1
345
- h = modules[m_idx](h, temb) # ResNet block
346
- m_idx += 1
347
-
348
- pyramid = None
349
-
350
- # Upsampling block
351
- for i_level in reversed(range(self.num_resolutions)):
352
- for i_block in range(self.num_res_blocks + 1):
353
- h = modules[m_idx](torch.cat([h, hs.pop()], dim=1), temb)
354
- m_idx += 1
355
-
356
- # edit: changed index from -1 (W) to -2 (H), as in the down path
357
- if h.shape[-2] in self.attn_resolutions:
358
- h = modules[m_idx](h)
359
- m_idx += 1
360
-
361
- if self.progressive != 'none':
362
- if i_level == self.num_resolutions - 1:
363
- if self.progressive == 'output_skip':
364
- pyramid = self.act(modules[m_idx](h)) # GroupNorm
365
- m_idx += 1
366
- pyramid = modules[m_idx](pyramid) # Conv2D: 256 -> 4
367
- m_idx += 1
368
- elif self.progressive == 'residual':
369
- pyramid = self.act(modules[m_idx](h))
370
- m_idx += 1
371
- pyramid = modules[m_idx](pyramid)
372
- m_idx += 1
373
- else:
374
- raise ValueError(f'{self.progressive} is not a valid name.')
375
- else:
376
- if self.progressive == 'output_skip':
377
- pyramid = self.pyramid_upsample(pyramid) # Upsample
378
- pyramid_h = self.act(modules[m_idx](h)) # GroupNorm
379
- m_idx += 1
380
- pyramid_h = modules[m_idx](pyramid_h)
381
- m_idx += 1
382
- pyramid = pyramid + pyramid_h
383
- elif self.progressive == 'residual':
384
- pyramid = modules[m_idx](pyramid)
385
- m_idx += 1
386
- if self.skip_rescale:
387
- pyramid = (pyramid + h) / np.sqrt(2.)
388
- else:
389
- pyramid = pyramid + h
390
- h = pyramid
391
- else:
392
- raise ValueError(f'{self.progressive} is not a valid name')
393
-
394
- # Upsampling Layer
395
- if i_level != 0:
396
- if self.resblock_type == 'ddpm':
397
- h = modules[m_idx](h)
398
- m_idx += 1
399
- else:
400
- h = modules[m_idx](h, temb) # Upsampling
401
- m_idx += 1
402
-
403
- assert not hs
404
-
405
- if self.progressive == 'output_skip':
406
- h = pyramid
407
- else:
408
- h = self.act(modules[m_idx](h))
409
- m_idx += 1
410
- h = modules[m_idx](h)
411
- m_idx += 1
412
-
413
- assert m_idx == len(modules), "Implementation error"
414
-
415
- # Convert back to complex number
416
- h = self.output_layer(h)
417
-
418
- if self.scale_by_sigma:
419
- used_sigmas = used_sigmas.reshape((x.shape[0], *([1] * len(x.shape[1:]))))
420
- h = h / used_sigmas
421
-
422
- h = torch.permute(h, (0, 2, 3, 1)).contiguous()
423
- h = torch.view_as_complex(h)[:,None, :, :]
424
- return h
 
1
+ # coding=utf-8
2
+ # Copyright 2020 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ # pylint: skip-file
17
+
18
+ from .ncsnpp_utils import layers, layerspp, normalization
19
+ import torch.nn as nn
20
+ import functools
21
+ import torch
22
+ import numpy as np
23
+
24
+ from .shared import BackboneRegistry
25
+
26
+ ResnetBlockDDPM = layerspp.ResnetBlockDDPMpp
27
+ ResnetBlockBigGAN = layerspp.ResnetBlockBigGANpp
28
+ Combine = layerspp.Combine
29
+ conv3x3 = layerspp.conv3x3
30
+ conv1x1 = layerspp.conv1x1
31
+ get_act = layers.get_act
32
+ get_normalization = normalization.get_normalization
33
+ default_initializer = layers.default_init
34
+
35
+
36
+ @BackboneRegistry.register("ncsnpp_48k")
37
+ class NCSNpp_48k(nn.Module):
38
+ """NCSN++ model, adapted from the https://github.com/yang-song/score_sde repository"""
39
+
40
+ @staticmethod
41
+ def add_argparse_args(parser):
42
+ parser.add_argument("--ch_mult",type=int, nargs='+', default=[1,1,2,2,2,2,2])
43
+ parser.add_argument("--num_res_blocks", type=int, default=2)
44
+ parser.add_argument("--attn_resolutions", type=int, nargs='+', default=[])
45
+ parser.add_argument("--nf", type=int, default=128, help="Number of channels to use in the model")
46
+ parser.add_argument("--no-centered", dest="centered", action="store_false", help="The data is not centered [-1, 1]")
47
+ parser.add_argument("--centered", dest="centered", action="store_true", help="The data is centered [-1, 1]")
48
+ parser.add_argument("--progressive", type=str, default='none', help="Progressive scheme for the output (upsampling) path")
49
+ parser.add_argument("--progressive_input", type=str, default='none', help="Progressive scheme for the input (downsampling) path")
50
+ parser.set_defaults(centered=True)
51
+ return parser
52
+
53
+ def __init__(self,
54
+ scale_by_sigma = True,
55
+ nonlinearity = 'swish',
56
+ nf = 128,
57
+ ch_mult = (1, 1, 2, 2, 2, 2, 2),
58
+ num_res_blocks = 2,
59
+ attn_resolutions = (),
60
+ resamp_with_conv = True,
61
+ conditional = True,
62
+ fir = True,
63
+ fir_kernel = [1, 3, 3, 1],
64
+ skip_rescale = True,
65
+ resblock_type = 'biggan',
66
+ progressive = 'none',
67
+ progressive_input = 'none',
68
+ progressive_combine = 'sum',
69
+ init_scale = 0.,
70
+ fourier_scale = 16,
71
+ image_size = 256,
72
+ embedding_type = 'fourier',
73
+ dropout = .0,
74
+ centered = True,
75
+ **unused_kwargs
76
+ ):
77
+ super().__init__()
78
+ self.act = act = get_act(nonlinearity)
79
+
80
+ self.nf = nf = nf
81
+ ch_mult = ch_mult
82
+ self.num_res_blocks = num_res_blocks = num_res_blocks
83
+ self.attn_resolutions = attn_resolutions
84
+ dropout = dropout
85
+ resamp_with_conv = resamp_with_conv
86
+ self.num_resolutions = num_resolutions = len(ch_mult)
87
+ self.all_resolutions = all_resolutions = [image_size // (2 ** i) for i in range(num_resolutions)]
88
+
89
+ self.conditional = conditional = conditional # noise-conditional
90
+ self.centered = centered
91
+ self.scale_by_sigma = scale_by_sigma
92
+
93
+ fir = fir
94
+ fir_kernel = fir_kernel
95
+ self.skip_rescale = skip_rescale = skip_rescale
96
+ self.resblock_type = resblock_type = resblock_type.lower()
97
+ self.progressive = progressive = progressive.lower()
98
+ self.progressive_input = progressive_input = progressive_input.lower()
99
+ self.embedding_type = embedding_type = embedding_type.lower()
100
+ init_scale = init_scale
101
+ assert progressive in ['none', 'output_skip', 'residual']
102
+ assert progressive_input in ['none', 'input_skip', 'residual']
103
+ assert embedding_type in ['fourier', 'positional']
104
+ combine_method = progressive_combine.lower()
105
+ combiner = functools.partial(Combine, method=combine_method)
106
+
107
+ num_channels = 4 # x.real, x.imag, y.real, y.imag
108
+ self.output_layer = nn.Conv2d(num_channels, 2, 1)
109
+
110
+ modules = []
111
+ # timestep/noise_level embedding
112
+ if embedding_type == 'fourier':
113
+ # Gaussian Fourier features embeddings.
114
+ modules.append(layerspp.GaussianFourierProjection(
115
+ embedding_size=nf, scale=fourier_scale
116
+ ))
117
+ embed_dim = 2 * nf
118
+ elif embedding_type == 'positional':
119
+ embed_dim = nf
120
+ else:
121
+ raise ValueError(f'embedding type {embedding_type} unknown.')
122
+
123
+ if conditional:
124
+ modules.append(nn.Linear(embed_dim, nf * 4))
125
+ modules[-1].weight.data = default_initializer()(modules[-1].weight.shape)
126
+ nn.init.zeros_(modules[-1].bias)
127
+ modules.append(nn.Linear(nf * 4, nf * 4))
128
+ modules[-1].weight.data = default_initializer()(modules[-1].weight.shape)
129
+ nn.init.zeros_(modules[-1].bias)
130
+
131
+ AttnBlock = functools.partial(layerspp.AttnBlockpp,
132
+ init_scale=init_scale, skip_rescale=skip_rescale)
133
+
134
+ Upsample = functools.partial(layerspp.Upsample,
135
+ with_conv=resamp_with_conv, fir=fir, fir_kernel=fir_kernel)
136
+
137
+ if progressive == 'output_skip':
138
+ self.pyramid_upsample = layerspp.Upsample(fir=fir, fir_kernel=fir_kernel, with_conv=False)
139
+ elif progressive == 'residual':
140
+ pyramid_upsample = functools.partial(layerspp.Upsample, fir=fir,
141
+ fir_kernel=fir_kernel, with_conv=True)
142
+
143
+ Downsample = functools.partial(layerspp.Downsample, with_conv=resamp_with_conv, fir=fir, fir_kernel=fir_kernel)
144
+
145
+ if progressive_input == 'input_skip':
146
+ self.pyramid_downsample = layerspp.Downsample(fir=fir, fir_kernel=fir_kernel, with_conv=False)
147
+ elif progressive_input == 'residual':
148
+ pyramid_downsample = functools.partial(layerspp.Downsample,
149
+ fir=fir, fir_kernel=fir_kernel, with_conv=True)
150
+
151
+ if resblock_type == 'ddpm':
152
+ ResnetBlock = functools.partial(ResnetBlockDDPM, act=act,
153
+ dropout=dropout, init_scale=init_scale,
154
+ skip_rescale=skip_rescale, temb_dim=nf * 4)
155
+
156
+ elif resblock_type == 'biggan':
157
+ ResnetBlock = functools.partial(ResnetBlockBigGAN, act=act,
158
+ dropout=dropout, fir=fir, fir_kernel=fir_kernel,
159
+ init_scale=init_scale, skip_rescale=skip_rescale, temb_dim=nf * 4)
160
+
161
+ else:
162
+ raise ValueError(f'resblock type {resblock_type} unrecognized.')
163
+
164
+ # Downsampling block
165
+
166
+ channels = num_channels
167
+ if progressive_input != 'none':
168
+ input_pyramid_ch = channels
169
+
170
+ modules.append(conv3x3(channels, nf))
171
+ hs_c = [nf]
172
+
173
+ in_ch = nf
174
+ for i_level in range(num_resolutions):
175
+ # Residual blocks for this resolution
176
+ for i_block in range(num_res_blocks):
177
+ out_ch = nf * ch_mult[i_level]
178
+ modules.append(ResnetBlock(in_ch=in_ch, out_ch=out_ch))
179
+ in_ch = out_ch
180
+
181
+ if all_resolutions[i_level] in attn_resolutions:
182
+ modules.append(AttnBlock(channels=in_ch))
183
+ hs_c.append(in_ch)
184
+
185
+ if i_level != num_resolutions - 1:
186
+ if resblock_type == 'ddpm':
187
+ modules.append(Downsample(in_ch=in_ch))
188
+ else:
189
+ modules.append(ResnetBlock(down=True, in_ch=in_ch))
190
+
191
+ if progressive_input == 'input_skip':
192
+ modules.append(combiner(dim1=input_pyramid_ch, dim2=in_ch))
193
+ if combine_method == 'cat':
194
+ in_ch *= 2
195
+
196
+ elif progressive_input == 'residual':
197
+ modules.append(pyramid_downsample(in_ch=input_pyramid_ch, out_ch=in_ch))
198
+ input_pyramid_ch = in_ch
199
+
200
+ hs_c.append(in_ch)
201
+
202
+ in_ch = hs_c[-1]
203
+ modules.append(ResnetBlock(in_ch=in_ch))
204
+ modules.append(AttnBlock(channels=in_ch))
205
+ modules.append(ResnetBlock(in_ch=in_ch))
206
+
207
+ pyramid_ch = 0
208
+ # Upsampling block
209
+ for i_level in reversed(range(num_resolutions)):
210
+ for i_block in range(num_res_blocks + 1): # one extra block per level: the up path consumes the additional skip connection stored after downsampling
211
+ out_ch = nf * ch_mult[i_level]
212
+ modules.append(ResnetBlock(in_ch=in_ch + hs_c.pop(), out_ch=out_ch))
213
+ in_ch = out_ch
214
+
215
+ if all_resolutions[i_level] in attn_resolutions:
216
+ modules.append(AttnBlock(channels=in_ch))
217
+
218
+ if progressive != 'none':
219
+ if i_level == num_resolutions - 1:
220
+ if progressive == 'output_skip':
221
+ modules.append(nn.GroupNorm(num_groups=min(in_ch // 4, 32),
222
+ num_channels=in_ch, eps=1e-6))
223
+ modules.append(conv3x3(in_ch, channels, init_scale=init_scale))
224
+ pyramid_ch = channels
225
+ elif progressive == 'residual':
226
+ modules.append(nn.GroupNorm(num_groups=min(in_ch // 4, 32), num_channels=in_ch, eps=1e-6))
227
+ modules.append(conv3x3(in_ch, in_ch, bias=True))
228
+ pyramid_ch = in_ch
229
+ else:
230
+ raise ValueError(f'{progressive} is not a valid name.')
231
+ else:
232
+ if progressive == 'output_skip':
233
+ modules.append(nn.GroupNorm(num_groups=min(in_ch // 4, 32),
234
+ num_channels=in_ch, eps=1e-6))
235
+ modules.append(conv3x3(in_ch, channels, bias=True, init_scale=init_scale))
236
+ pyramid_ch = channels
237
+ elif progressive == 'residual':
238
+ modules.append(pyramid_upsample(in_ch=pyramid_ch, out_ch=in_ch))
239
+ pyramid_ch = in_ch
240
+ else:
241
+ raise ValueError(f'{progressive} is not a valid name')
242
+
243
+ if i_level != 0:
244
+ if resblock_type == 'ddpm':
245
+ modules.append(Upsample(in_ch=in_ch))
246
+ else:
247
+ modules.append(ResnetBlock(in_ch=in_ch, up=True))
248
+
249
+ assert not hs_c
250
+
251
+ if progressive != 'output_skip':
252
+ modules.append(nn.GroupNorm(num_groups=min(in_ch // 4, 32),
253
+ num_channels=in_ch, eps=1e-6))
254
+ modules.append(conv3x3(in_ch, channels, init_scale=init_scale))
255
+
256
+ self.all_modules = nn.ModuleList(modules)
257
+
258
+
259
+ def forward(self, x, time_cond):
260
+ # timestep/noise_level embedding; only for continuous training
261
+ modules = self.all_modules
262
+ m_idx = 0
263
+
264
+ # Convert real and imaginary parts of (x,y) into four channel dimensions
265
+ x = torch.cat((x[:,[0],:,:].real, x[:,[0],:,:].imag,
266
+ x[:,[1],:,:].real, x[:,[1],:,:].imag), dim=1)
267
+
268
+ if self.embedding_type == 'fourier':
269
+ # Gaussian Fourier features embeddings.
270
+ used_sigmas = time_cond
271
+ temb = modules[m_idx](torch.log(used_sigmas))
272
+ m_idx += 1
273
+
274
+ elif self.embedding_type == 'positional':
275
+ # Sinusoidal positional embeddings.
276
+ timesteps = time_cond
277
+ used_sigmas = self.sigmas[time_cond.long()]
278
+ temb = layers.get_timestep_embedding(timesteps, self.nf)
279
+
280
+ else:
281
+ raise ValueError(f'embedding type {self.embedding_type} unknown.')
282
+
283
+ if self.conditional:
284
+ temb = modules[m_idx](temb)
285
+ m_idx += 1
286
+ temb = modules[m_idx](self.act(temb))
287
+ m_idx += 1
288
+ else:
289
+ temb = None
290
+
291
+ if not self.centered:
292
+ # If input data is in [0, 1]
293
+ x = 2 * x - 1.
294
+
295
+ # Downsampling block
296
+ input_pyramid = None
297
+ if self.progressive_input != 'none':
298
+ input_pyramid = x
299
+
300
+ # Input layer: Conv2d: 4ch -> 128ch
301
+ hs = [modules[m_idx](x)]
302
+ m_idx += 1
303
+
304
+ # Down path in U-Net
305
+ for i_level in range(self.num_resolutions):
306
+ # Residual blocks for this resolution
307
+ for i_block in range(self.num_res_blocks):
308
+ h = modules[m_idx](hs[-1], temb)
309
+ m_idx += 1
310
+ # Attention layer (optional)
311
+ if h.shape[-2] in self.attn_resolutions: # edit: check H dim (-2) not W dim (-1)
312
+ h = modules[m_idx](h)
313
+ m_idx += 1
314
+ hs.append(h)
315
+
316
+ # Downsampling
317
+ if i_level != self.num_resolutions - 1:
318
+ if self.resblock_type == 'ddpm':
319
+ h = modules[m_idx](hs[-1])
320
+ m_idx += 1
321
+ else:
322
+ h = modules[m_idx](hs[-1], temb)
323
+ m_idx += 1
324
+
325
+ if self.progressive_input == 'input_skip': # Combine h with x
326
+ input_pyramid = self.pyramid_downsample(input_pyramid)
327
+ h = modules[m_idx](input_pyramid, h)
328
+ m_idx += 1
329
+
330
+ elif self.progressive_input == 'residual':
331
+ input_pyramid = modules[m_idx](input_pyramid)
332
+ m_idx += 1
333
+ if self.skip_rescale:
334
+ input_pyramid = (input_pyramid + h) / np.sqrt(2.)
335
+ else:
336
+ input_pyramid = input_pyramid + h
337
+ h = input_pyramid
338
+ hs.append(h)
339
+
340
+ h = hs[-1] # effectively a no-op: h already equals hs[-1]
341
+ h = modules[m_idx](h, temb) # ResNet block
342
+ m_idx += 1
343
+ h = modules[m_idx](h) # Attention block
344
+ m_idx += 1
345
+ h = modules[m_idx](h, temb) # ResNet block
346
+ m_idx += 1
347
+
348
+ pyramid = None
349
+
350
+ # Upsampling block
351
+ for i_level in reversed(range(self.num_resolutions)):
352
+ for i_block in range(self.num_res_blocks + 1):
353
+ h = modules[m_idx](torch.cat([h, hs.pop()], dim=1), temb)
354
+ m_idx += 1
355
+
356
+ # edit: changed index from -1 (W) to -2 (H), as in the down path
357
+ if h.shape[-2] in self.attn_resolutions:
358
+ h = modules[m_idx](h)
359
+ m_idx += 1
360
+
361
+ if self.progressive != 'none':
362
+ if i_level == self.num_resolutions - 1:
363
+ if self.progressive == 'output_skip':
364
+ pyramid = self.act(modules[m_idx](h)) # GroupNorm
365
+ m_idx += 1
366
+ pyramid = modules[m_idx](pyramid) # Conv2D: 256 -> 4
367
+ m_idx += 1
368
+ elif self.progressive == 'residual':
369
+ pyramid = self.act(modules[m_idx](h))
370
+ m_idx += 1
371
+ pyramid = modules[m_idx](pyramid)
372
+ m_idx += 1
373
+ else:
374
+ raise ValueError(f'{self.progressive} is not a valid name.')
375
+ else:
376
+ if self.progressive == 'output_skip':
377
+ pyramid = self.pyramid_upsample(pyramid) # Upsample
378
+ pyramid_h = self.act(modules[m_idx](h)) # GroupNorm
379
+ m_idx += 1
380
+ pyramid_h = modules[m_idx](pyramid_h)
381
+ m_idx += 1
382
+ pyramid = pyramid + pyramid_h
383
+ elif self.progressive == 'residual':
384
+ pyramid = modules[m_idx](pyramid)
385
+ m_idx += 1
386
+ if self.skip_rescale:
387
+ pyramid = (pyramid + h) / np.sqrt(2.)
388
+ else:
389
+ pyramid = pyramid + h
390
+ h = pyramid
391
+ else:
392
+ raise ValueError(f'{self.progressive} is not a valid name')
393
+
394
+ # Upsampling Layer
395
+ if i_level != 0:
396
+ if self.resblock_type == 'ddpm':
397
+ h = modules[m_idx](h)
398
+ m_idx += 1
399
+ else:
400
+ h = modules[m_idx](h, temb) # Upsampling
401
+ m_idx += 1
402
+
403
+ assert not hs
404
+
405
+ if self.progressive == 'output_skip':
406
+ h = pyramid
407
+ else:
408
+ h = self.act(modules[m_idx](h))
409
+ m_idx += 1
410
+ h = modules[m_idx](h)
411
+ m_idx += 1
412
+
413
+ assert m_idx == len(modules), "Implementation error"
414
+
415
+ # Convert back to complex number
416
+ h = self.output_layer(h)
417
+
418
+ if self.scale_by_sigma:
419
+ used_sigmas = used_sigmas.reshape((x.shape[0], *([1] * len(x.shape[1:]))))
420
+ h = h / used_sigmas
421
+
422
+ h = torch.permute(h, (0, 2, 3, 1)).contiguous()
423
+ h = torch.view_as_complex(h)[:,None, :, :]
424
+ return h
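A hedged usage sketch for this backbone (not part of the diff); the constructor defaults above are assumed, image_size=256 keeps all seven resolutions integral, and the call may be memory-hungry at full size:

import torch
from sgmse.backbones.ncsnpp_48k import NCSNpp_48k

model = NCSNpp_48k()                                  # defaults: nf=128, biggan blocks, fourier embedding
x = torch.randn(1, 2, 256, 256, dtype=torch.cfloat)   # stacked (x, y) complex spectrograms
sigmas = torch.rand(1) + 0.1                          # positive noise levels; forward() takes their log
out = model(x, sigmas)                                # complex tensor of shape (1, 1, 256, 256)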
sgmse/backbones/ncsnpp_utils/layers.py CHANGED
@@ -1,662 +1,662 @@
1
- # coding=utf-8
2
- # Copyright 2020 The Google Research Authors.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
-
16
- # pylint: skip-file
17
- """Common layers for defining score networks.
18
- """
19
- import math
20
- import string
21
- from functools import partial
22
- import torch.nn as nn
23
- import torch
24
- import torch.nn.functional as F
25
- import numpy as np
26
- from .normalization import ConditionalInstanceNorm2dPlus
27
-
28
-
29
- def get_act(config):
30
- """Get the activation function from its name."""
31
-
32
- if config == 'elu':
33
- return nn.ELU()
34
- elif config == 'relu':
35
- return nn.ReLU()
36
- elif config == 'lrelu':
37
- return nn.LeakyReLU(negative_slope=0.2)
38
- elif config == 'swish':
39
- return nn.SiLU()
40
- else:
41
- raise NotImplementedError('activation function does not exist!')
42
-
43
-
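A brief usage note for get_act above (illustrative, not part of the diff):

act = get_act('swish')   # returns nn.SiLU(); unrecognized names raise NotImplementedError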
44
- def ncsn_conv1x1(in_planes, out_planes, stride=1, bias=True, dilation=1, init_scale=1., padding=0):
45
- """1x1 convolution. Same as NCSNv1/v2."""
46
- conv = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=bias, dilation=dilation,
47
- padding=padding)
48
- init_scale = 1e-10 if init_scale == 0 else init_scale
49
- conv.weight.data *= init_scale
50
- conv.bias.data *= init_scale
51
- return conv
52
-
53
-
54
- def variance_scaling(scale, mode, distribution,
55
- in_axis=1, out_axis=0,
56
- dtype=torch.float32,
57
- device='cpu'):
58
- """Ported from JAX. """
59
-
60
- def _compute_fans(shape, in_axis=1, out_axis=0):
61
- receptive_field_size = np.prod(shape) / shape[in_axis] / shape[out_axis]
62
- fan_in = shape[in_axis] * receptive_field_size
63
- fan_out = shape[out_axis] * receptive_field_size
64
- return fan_in, fan_out
65
-
66
- def init(shape, dtype=dtype, device=device):
67
- fan_in, fan_out = _compute_fans(shape, in_axis, out_axis)
68
- if mode == "fan_in":
69
- denominator = fan_in
70
- elif mode == "fan_out":
71
- denominator = fan_out
72
- elif mode == "fan_avg":
73
- denominator = (fan_in + fan_out) / 2
74
- else:
75
- raise ValueError(
76
- "invalid mode for variance scaling initializer: {}".format(mode))
77
- variance = scale / denominator
78
- if distribution == "normal":
79
- return torch.randn(*shape, dtype=dtype, device=device) * np.sqrt(variance)
80
- elif distribution == "uniform":
81
- return (torch.rand(*shape, dtype=dtype, device=device) * 2. - 1.) * np.sqrt(3 * variance)
82
- else:
83
- raise ValueError("invalid distribution for variance scaling initializer")
84
-
85
- return init
86
-
87
-
88
- def default_init(scale=1.):
89
- """The same initialization used in DDPM."""
90
- scale = 1e-10 if scale == 0 else scale
91
- return variance_scaling(scale, 'fan_avg', 'uniform')
92
-
93
-
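A hedged sketch (not part of the diff) of what default_init above produces; the weight shape is an assumption. For a conv weight of shape (out, in, k, k), fan_in = in*k*k and fan_out = out*k*k, so fan_avg = (fan_in + fan_out) / 2 and samples are uniform with variance scale / fan_avg:

init = default_init(scale=1.0)   # variance_scaling with 'fan_avg', 'uniform'
w = init((64, 32, 3, 3))         # fan_in=288, fan_out=576, fan_avg=432
assert w.shape == (64, 32, 3, 3)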
94
- class Dense(nn.Module):
95
- """Linear layer with `default_init`."""
96
- def __init__(self):
97
- super().__init__()
98
-
99
-
100
- def ddpm_conv1x1(in_planes, out_planes, stride=1, bias=True, init_scale=1., padding=0):
101
- """1x1 convolution with DDPM initialization."""
102
- conv = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, padding=padding, bias=bias)
103
- conv.weight.data = default_init(init_scale)(conv.weight.data.shape)
104
- nn.init.zeros_(conv.bias)
105
- return conv
106
-
107
-
108
- def ncsn_conv3x3(in_planes, out_planes, stride=1, bias=True, dilation=1, init_scale=1., padding=1):
109
- """3x3 convolution with PyTorch initialization. Same as NCSNv1/NCSNv2."""
110
- init_scale = 1e-10 if init_scale == 0 else init_scale
111
- conv = nn.Conv2d(in_planes, out_planes, stride=stride, bias=bias,
112
- dilation=dilation, padding=padding, kernel_size=3)
113
- conv.weight.data *= init_scale
114
- conv.bias.data *= init_scale
115
- return conv
116
-
117
-
118
- def ddpm_conv3x3(in_planes, out_planes, stride=1, bias=True, dilation=1, init_scale=1., padding=1):
119
- """3x3 convolution with DDPM initialization."""
120
- conv = nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=padding,
121
- dilation=dilation, bias=bias)
122
- conv.weight.data = default_init(init_scale)(conv.weight.data.shape)
123
- nn.init.zeros_(conv.bias)
124
- return conv
125
-
126
- ###########################################################################
127
- # Functions below are ported over from the NCSNv1/NCSNv2 codebase:
128
- # https://github.com/ermongroup/ncsn
129
- # https://github.com/ermongroup/ncsnv2
130
- ###########################################################################
131
-
132
-
133
- class CRPBlock(nn.Module):
134
- def __init__(self, features, n_stages, act=nn.ReLU(), maxpool=True):
135
- super().__init__()
136
- self.convs = nn.ModuleList()
137
- for i in range(n_stages):
138
- self.convs.append(ncsn_conv3x3(features, features, stride=1, bias=False))
139
- self.n_stages = n_stages
140
- if maxpool:
141
- self.pool = nn.MaxPool2d(kernel_size=5, stride=1, padding=2)
142
- else:
143
- self.pool = nn.AvgPool2d(kernel_size=5, stride=1, padding=2)
144
-
145
- self.act = act
146
-
147
- def forward(self, x):
148
- x = self.act(x)
149
- path = x
150
- for i in range(self.n_stages):
151
- path = self.pool(path)
152
- path = self.convs[i](path)
153
- x = path + x
154
- return x
155
-
156
-
157
- class CondCRPBlock(nn.Module):
158
- def __init__(self, features, n_stages, num_classes, normalizer, act=nn.ReLU()):
159
- super().__init__()
160
- self.convs = nn.ModuleList()
161
- self.norms = nn.ModuleList()
162
- self.normalizer = normalizer
163
- for i in range(n_stages):
164
- self.norms.append(normalizer(features, num_classes, bias=True))
165
- self.convs.append(ncsn_conv3x3(features, features, stride=1, bias=False))
166
-
167
- self.n_stages = n_stages
168
- self.pool = nn.AvgPool2d(kernel_size=5, stride=1, padding=2)
169
- self.act = act
170
-
171
- def forward(self, x, y):
172
- x = self.act(x)
173
- path = x
174
- for i in range(self.n_stages):
175
- path = self.norms[i](path, y)
176
- path = self.pool(path)
177
- path = self.convs[i](path)
178
-
179
- x = path + x
180
- return x
181
-
182
-
183
- class RCUBlock(nn.Module):
184
- def __init__(self, features, n_blocks, n_stages, act=nn.ReLU()):
185
- super().__init__()
186
-
187
- for i in range(n_blocks):
188
- for j in range(n_stages):
189
- setattr(self, '{}_{}_conv'.format(i + 1, j + 1), ncsn_conv3x3(features, features, stride=1, bias=False))
190
-
191
- self.stride = 1
192
- self.n_blocks = n_blocks
193
- self.n_stages = n_stages
194
- self.act = act
195
-
196
- def forward(self, x):
197
- for i in range(self.n_blocks):
198
- residual = x
199
- for j in range(self.n_stages):
200
- x = self.act(x)
201
- x = getattr(self, '{}_{}_conv'.format(i + 1, j + 1))(x)
202
-
203
- x += residual
204
- return x
205
-
206
-
207
- class CondRCUBlock(nn.Module):
208
- def __init__(self, features, n_blocks, n_stages, num_classes, normalizer, act=nn.ReLU()):
209
- super().__init__()
210
-
211
- for i in range(n_blocks):
212
- for j in range(n_stages):
213
- setattr(self, '{}_{}_norm'.format(i + 1, j + 1), normalizer(features, num_classes, bias=True))
214
- setattr(self, '{}_{}_conv'.format(i + 1, j + 1), ncsn_conv3x3(features, features, stride=1, bias=False))
215
-
216
- self.stride = 1
217
- self.n_blocks = n_blocks
218
- self.n_stages = n_stages
219
- self.act = act
220
- self.normalizer = normalizer
221
-
222
- def forward(self, x, y):
223
- for i in range(self.n_blocks):
224
- residual = x
225
- for j in range(self.n_stages):
226
- x = getattr(self, '{}_{}_norm'.format(i + 1, j + 1))(x, y)
227
- x = self.act(x)
228
- x = getattr(self, '{}_{}_conv'.format(i + 1, j + 1))(x)
229
-
230
- x += residual
231
- return x
232
-
233
-
234
- class MSFBlock(nn.Module):
235
- def __init__(self, in_planes, features):
236
- super().__init__()
237
- assert isinstance(in_planes, list) or isinstance(in_planes, tuple)
238
- self.convs = nn.ModuleList()
239
- self.features = features
240
-
241
- for i in range(len(in_planes)):
242
- self.convs.append(ncsn_conv3x3(in_planes[i], features, stride=1, bias=True))
243
-
244
- def forward(self, xs, shape):
245
- sums = torch.zeros(xs[0].shape[0], self.features, *shape, device=xs[0].device)
246
- for i in range(len(self.convs)):
247
- h = self.convs[i](xs[i])
248
- h = F.interpolate(h, size=shape, mode='bilinear', align_corners=True)
249
- sums += h
250
- return sums
251
-
252
-
253
- class CondMSFBlock(nn.Module):
254
- def __init__(self, in_planes, features, num_classes, normalizer):
255
- super().__init__()
256
- assert isinstance(in_planes, list) or isinstance(in_planes, tuple)
257
-
258
- self.convs = nn.ModuleList()
259
- self.norms = nn.ModuleList()
260
- self.features = features
261
- self.normalizer = normalizer
262
-
263
- for i in range(len(in_planes)):
264
- self.convs.append(ncsn_conv3x3(in_planes[i], features, stride=1, bias=True))
265
- self.norms.append(normalizer(in_planes[i], num_classes, bias=True))
266
-
267
- def forward(self, xs, y, shape):
268
- sums = torch.zeros(xs[0].shape[0], self.features, *shape, device=xs[0].device)
269
- for i in range(len(self.convs)):
270
- h = self.norms[i](xs[i], y)
271
- h = self.convs[i](h)
272
- h = F.interpolate(h, size=shape, mode='bilinear', align_corners=True)
273
- sums += h
274
- return sums
275
-
276
-
277
- class RefineBlock(nn.Module):
278
- def __init__(self, in_planes, features, act=nn.ReLU(), start=False, end=False, maxpool=True):
279
- super().__init__()
280
-
281
- assert isinstance(in_planes, tuple) or isinstance(in_planes, list)
282
- self.n_blocks = n_blocks = len(in_planes)
283
-
284
- self.adapt_convs = nn.ModuleList()
285
- for i in range(n_blocks):
286
- self.adapt_convs.append(RCUBlock(in_planes[i], 2, 2, act))
287
-
288
- self.output_convs = RCUBlock(features, 3 if end else 1, 2, act)
289
-
290
- if not start:
291
- self.msf = MSFBlock(in_planes, features)
292
-
293
- self.crp = CRPBlock(features, 2, act, maxpool=maxpool)
294
-
295
- def forward(self, xs, output_shape):
296
- assert isinstance(xs, tuple) or isinstance(xs, list)
297
- hs = []
298
- for i in range(len(xs)):
299
- h = self.adapt_convs[i](xs[i])
300
- hs.append(h)
301
-
302
- if self.n_blocks > 1:
303
- h = self.msf(hs, output_shape)
304
- else:
305
- h = hs[0]
306
-
307
- h = self.crp(h)
308
- h = self.output_convs(h)
309
-
310
- return h
311
-
312
-
313
- class CondRefineBlock(nn.Module):
314
- def __init__(self, in_planes, features, num_classes, normalizer, act=nn.ReLU(), start=False, end=False):
315
- super().__init__()
316
-
317
- assert isinstance(in_planes, tuple) or isinstance(in_planes, list)
318
- self.n_blocks = n_blocks = len(in_planes)
319
-
320
- self.adapt_convs = nn.ModuleList()
321
- for i in range(n_blocks):
322
- self.adapt_convs.append(
323
- CondRCUBlock(in_planes[i], 2, 2, num_classes, normalizer, act)
324
- )
325
-
326
- self.output_convs = CondRCUBlock(features, 3 if end else 1, 2, num_classes, normalizer, act)
327
-
328
- if not start:
329
- self.msf = CondMSFBlock(in_planes, features, num_classes, normalizer)
330
-
331
- self.crp = CondCRPBlock(features, 2, num_classes, normalizer, act)
332
-
333
- def forward(self, xs, y, output_shape):
334
- assert isinstance(xs, tuple) or isinstance(xs, list)
335
- hs = []
336
- for i in range(len(xs)):
337
- h = self.adapt_convs[i](xs[i], y)
338
- hs.append(h)
339
-
340
- if self.n_blocks > 1:
341
- h = self.msf(hs, y, output_shape)
342
- else:
343
- h = hs[0]
344
-
345
- h = self.crp(h, y)
346
- h = self.output_convs(h, y)
347
-
348
- return h
349
-
350
-
351
- class ConvMeanPool(nn.Module):
352
- def __init__(self, input_dim, output_dim, kernel_size=3, biases=True, adjust_padding=False):
353
- super().__init__()
354
- if not adjust_padding:
355
- conv = nn.Conv2d(input_dim, output_dim, kernel_size, stride=1, padding=kernel_size // 2, bias=biases)
356
- self.conv = conv
357
- else:
358
- conv = nn.Conv2d(input_dim, output_dim, kernel_size, stride=1, padding=kernel_size // 2, bias=biases)
359
-
360
- self.conv = nn.Sequential(
361
- nn.ZeroPad2d((1, 0, 1, 0)),
362
- conv
363
- )
364
-
365
- def forward(self, inputs):
366
- output = self.conv(inputs)
367
- output = sum([output[:, :, ::2, ::2], output[:, :, 1::2, ::2],
368
- output[:, :, ::2, 1::2], output[:, :, 1::2, 1::2]]) / 4.
369
- return output
370
-
371
-
372
- class MeanPoolConv(nn.Module):
373
- def __init__(self, input_dim, output_dim, kernel_size=3, biases=True):
374
- super().__init__()
375
- self.conv = nn.Conv2d(input_dim, output_dim, kernel_size, stride=1, padding=kernel_size // 2, bias=biases)
376
-
377
- def forward(self, inputs):
378
- output = inputs
379
- output = sum([output[:, :, ::2, ::2], output[:, :, 1::2, ::2],
380
- output[:, :, ::2, 1::2], output[:, :, 1::2, 1::2]]) / 4.
381
- return self.conv(output)
382
-
383
-
384
- class UpsampleConv(nn.Module):
385
- def __init__(self, input_dim, output_dim, kernel_size=3, biases=True):
386
- super().__init__()
387
- self.conv = nn.Conv2d(input_dim, output_dim, kernel_size, stride=1, padding=kernel_size // 2, bias=biases)
388
- self.pixelshuffle = nn.PixelShuffle(upscale_factor=2)
389
-
390
- def forward(self, inputs):
391
- output = inputs
392
- output = torch.cat([output, output, output, output], dim=1)
393
- output = self.pixelshuffle(output)
394
- return self.conv(output)
395
-
396
-
397
- class ConditionalResidualBlock(nn.Module):
398
- def __init__(self, input_dim, output_dim, num_classes, resample=1, act=nn.ELU(),
399
- normalization=ConditionalInstanceNorm2dPlus, adjust_padding=False, dilation=None):
400
- super().__init__()
401
- self.non_linearity = act
402
- self.input_dim = input_dim
403
- self.output_dim = output_dim
404
- self.resample = resample
405
- self.normalization = normalization
406
- if resample == 'down':
407
- if dilation > 1:
408
- self.conv1 = ncsn_conv3x3(input_dim, input_dim, dilation=dilation)
409
- self.normalize2 = normalization(input_dim, num_classes)
410
- self.conv2 = ncsn_conv3x3(input_dim, output_dim, dilation=dilation)
411
- conv_shortcut = partial(ncsn_conv3x3, dilation=dilation)
412
- else:
413
- self.conv1 = ncsn_conv3x3(input_dim, input_dim)
414
- self.normalize2 = normalization(input_dim, num_classes)
415
- self.conv2 = ConvMeanPool(input_dim, output_dim, 3, adjust_padding=adjust_padding)
416
- conv_shortcut = partial(ConvMeanPool, kernel_size=1, adjust_padding=adjust_padding)
417
-
418
- elif resample is None:
419
- if dilation > 1:
420
- conv_shortcut = partial(ncsn_conv3x3, dilation=dilation)
421
- self.conv1 = ncsn_conv3x3(input_dim, output_dim, dilation=dilation)
422
- self.normalize2 = normalization(output_dim, num_classes)
423
- self.conv2 = ncsn_conv3x3(output_dim, output_dim, dilation=dilation)
424
- else:
425
- conv_shortcut = nn.Conv2d
426
- self.conv1 = ncsn_conv3x3(input_dim, output_dim)
427
- self.normalize2 = normalization(output_dim, num_classes)
428
- self.conv2 = ncsn_conv3x3(output_dim, output_dim)
429
- else:
430
- raise Exception('invalid resample value')
431
-
432
- if output_dim != input_dim or resample is not None:
433
- self.shortcut = conv_shortcut(input_dim, output_dim)
434
-
435
- self.normalize1 = normalization(input_dim, num_classes)
436
-
437
- def forward(self, x, y):
438
- output = self.normalize1(x, y)
439
- output = self.non_linearity(output)
440
- output = self.conv1(output)
441
- output = self.normalize2(output, y)
442
- output = self.non_linearity(output)
443
- output = self.conv2(output)
444
-
445
- if self.output_dim == self.input_dim and self.resample is None:
446
- shortcut = x
447
- else:
448
- shortcut = self.shortcut(x)
449
-
450
- return shortcut + output
451
-
452
-
453
- class ResidualBlock(nn.Module):
454
- def __init__(self, input_dim, output_dim, resample=None, act=nn.ELU(),
455
- normalization=nn.InstanceNorm2d, adjust_padding=False, dilation=1):
456
- super().__init__()
457
- self.non_linearity = act
458
- self.input_dim = input_dim
459
- self.output_dim = output_dim
460
- self.resample = resample
461
- self.normalization = normalization
462
- if resample == 'down':
463
- if dilation > 1:
464
- self.conv1 = ncsn_conv3x3(input_dim, input_dim, dilation=dilation)
465
- self.normalize2 = normalization(input_dim)
466
- self.conv2 = ncsn_conv3x3(input_dim, output_dim, dilation=dilation)
467
- conv_shortcut = partial(ncsn_conv3x3, dilation=dilation)
468
- else:
469
- self.conv1 = ncsn_conv3x3(input_dim, input_dim)
470
- self.normalize2 = normalization(input_dim)
471
- self.conv2 = ConvMeanPool(input_dim, output_dim, 3, adjust_padding=adjust_padding)
472
- conv_shortcut = partial(ConvMeanPool, kernel_size=1, adjust_padding=adjust_padding)
473
-
474
- elif resample is None:
475
- if dilation > 1:
476
- conv_shortcut = partial(ncsn_conv3x3, dilation=dilation)
477
- self.conv1 = ncsn_conv3x3(input_dim, output_dim, dilation=dilation)
478
- self.normalize2 = normalization(output_dim)
479
- self.conv2 = ncsn_conv3x3(output_dim, output_dim, dilation=dilation)
480
- else:
481
- # conv_shortcut = nn.Conv2d ### Something weird here.
482
- conv_shortcut = partial(ncsn_conv1x1)
483
- self.conv1 = ncsn_conv3x3(input_dim, output_dim)
484
- self.normalize2 = normalization(output_dim)
485
- self.conv2 = ncsn_conv3x3(output_dim, output_dim)
486
- else:
487
- raise Exception('invalid resample value')
488
-
489
- if output_dim != input_dim or resample is not None:
490
- self.shortcut = conv_shortcut(input_dim, output_dim)
491
-
492
- self.normalize1 = normalization(input_dim)
493
-
494
- def forward(self, x):
495
- output = self.normalize1(x)
496
- output = self.non_linearity(output)
497
- output = self.conv1(output)
498
- output = self.normalize2(output)
499
- output = self.non_linearity(output)
500
- output = self.conv2(output)
501
-
502
- if self.output_dim == self.input_dim and self.resample is None:
503
- shortcut = x
504
- else:
505
- shortcut = self.shortcut(x)
506
-
507
- return shortcut + output
508
-
509
-
510
- ###########################################################################
511
- # Functions below are ported over from the DDPM codebase:
512
- # https://github.com/hojonathanho/diffusion/blob/master/diffusion_tf/nn.py
513
- ###########################################################################
514
-
515
- def get_timestep_embedding(timesteps, embedding_dim, max_positions=10000):
516
- assert len(timesteps.shape) == 1 # and timesteps.dtype == tf.int32
517
- half_dim = embedding_dim // 2
518
- # magic number 10000 is from transformers
519
- emb = math.log(max_positions) / (half_dim - 1)
520
- # emb = math.log(2.) / (half_dim - 1)
521
- emb = torch.exp(torch.arange(half_dim, dtype=torch.float32, device=timesteps.device) * -emb)
522
- # emb = tf.range(num_embeddings, dtype=jnp.float32)[:, None] * emb[None, :]
523
- # emb = tf.cast(timesteps, dtype=jnp.float32)[:, None] * emb[None, :]
524
- emb = timesteps.float()[:, None] * emb[None, :]
525
- emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
526
- if embedding_dim % 2 == 1: # zero pad
527
- emb = F.pad(emb, (0, 1), mode='constant')
528
- assert emb.shape == (timesteps.shape[0], embedding_dim)
529
- return emb
530
-
531
-
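A hedged usage note for get_timestep_embedding above (illustrative, not part of the diff):

import torch
ts = torch.tensor([0, 10, 100])                       # must be 1-D (see the assert above)
emb = get_timestep_embedding(ts, embedding_dim=128)   # -> shape (3, 128)
# Columns are [sin | cos] halves; frequencies fall geometrically from 1 to 1/max_positions.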
532
- def _einsum(a, b, c, x, y):
533
- einsum_str = '{},{}->{}'.format(''.join(a), ''.join(b), ''.join(c))
534
- return torch.einsum(einsum_str, x, y)
535
-
536
-
537
- def contract_inner(x, y):
538
- """tensordot(x, y, 1)."""
539
- x_chars = list(string.ascii_lowercase[:len(x.shape)])
540
- y_chars = list(string.ascii_lowercase[len(x.shape):len(y.shape) + len(x.shape)])
541
- y_chars[0] = x_chars[-1] # first axis of y and last of x get summed
542
- out_chars = x_chars[:-1] + y_chars[1:]
543
- return _einsum(x_chars, y_chars, out_chars, x, y)
544
-
545
-
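As the docstring says, contract_inner(x, y) matches torch.tensordot(x, y, 1); a quick illustrative check (not part of the diff):

import torch
a = torch.randn(2, 3, 4)
b = torch.randn(4, 5)
assert torch.allclose(contract_inner(a, b),           # einsum 'abc,ce->abe'
                      torch.tensordot(a, b, dims=1), atol=1e-5)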
546
- class NIN(nn.Module):
547
- def __init__(self, in_dim, num_units, init_scale=0.1):
548
- super().__init__()
549
- self.W = nn.Parameter(default_init(scale=init_scale)((in_dim, num_units)), requires_grad=True)
550
- self.b = nn.Parameter(torch.zeros(num_units), requires_grad=True)
551
-
552
- def forward(self, x):
553
- x = x.permute(0, 2, 3, 1)
554
- y = contract_inner(x, self.W) + self.b
555
- return y.permute(0, 3, 1, 2)
556
-
557
-
558
- class AttnBlock(nn.Module):
559
- """Channel-wise self-attention block."""
560
- def __init__(self, channels):
561
- super().__init__()
562
- self.GroupNorm_0 = nn.GroupNorm(num_groups=32, num_channels=channels, eps=1e-6)
563
- self.NIN_0 = NIN(channels, channels)
564
- self.NIN_1 = NIN(channels, channels)
565
- self.NIN_2 = NIN(channels, channels)
566
- self.NIN_3 = NIN(channels, channels, init_scale=0.)
567
-
568
- def forward(self, x):
569
- B, C, H, W = x.shape
570
- h = self.GroupNorm_0(x)
571
- q = self.NIN_0(h)
572
- k = self.NIN_1(h)
573
- v = self.NIN_2(h)
574
-
575
- w = torch.einsum('bchw,bcij->bhwij', q, k) * (int(C) ** (-0.5))
576
- w = torch.reshape(w, (B, H, W, H * W))
577
- w = F.softmax(w, dim=-1)
578
- w = torch.reshape(w, (B, H, W, H, W))
579
- h = torch.einsum('bhwij,bcij->bchw', w, v)
580
- h = self.NIN_3(h)
581
- return x + h
582
-
583
-
584
- class Upsample(nn.Module):
585
- def __init__(self, channels, with_conv=False):
586
- super().__init__()
587
- if with_conv:
588
- self.Conv_0 = ddpm_conv3x3(channels, channels)
589
- self.with_conv = with_conv
590
-
591
- def forward(self, x):
592
- B, C, H, W = x.shape
593
- h = F.interpolate(x, (H * 2, W * 2), mode='nearest')
594
- if self.with_conv:
595
- h = self.Conv_0(h)
596
- return h
597
-
598
-
599
- class Downsample(nn.Module):
600
- def __init__(self, channels, with_conv=False):
601
- super().__init__()
602
- if with_conv:
603
- self.Conv_0 = ddpm_conv3x3(channels, channels, stride=2, padding=0)
604
- self.with_conv = with_conv
605
-
606
- def forward(self, x):
607
- B, C, H, W = x.shape
608
- # Emulate 'SAME' padding
609
- if self.with_conv:
610
- x = F.pad(x, (0, 1, 0, 1))
611
- x = self.Conv_0(x)
612
- else:
613
- x = F.avg_pool2d(x, kernel_size=2, stride=2, padding=0)
614
-
615
- assert x.shape == (B, C, H // 2, W // 2)
616
- return x
617
-
618
-
619
- class ResnetBlockDDPM(nn.Module):
620
- """The ResNet Blocks used in DDPM."""
621
- def __init__(self, act, in_ch, out_ch=None, temb_dim=None, conv_shortcut=False, dropout=0.1):
622
- super().__init__()
623
- if out_ch is None:
624
- out_ch = in_ch
625
- self.GroupNorm_0 = nn.GroupNorm(num_groups=32, num_channels=in_ch, eps=1e-6)
626
- self.act = act
627
- self.Conv_0 = ddpm_conv3x3(in_ch, out_ch)
628
- if temb_dim is not None:
629
- self.Dense_0 = nn.Linear(temb_dim, out_ch)
630
- self.Dense_0.weight.data = default_init()(self.Dense_0.weight.data.shape)
631
- nn.init.zeros_(self.Dense_0.bias)
632
-
633
- self.GroupNorm_1 = nn.GroupNorm(num_groups=32, num_channels=out_ch, eps=1e-6)
634
- self.Dropout_0 = nn.Dropout(dropout)
635
- self.Conv_1 = ddpm_conv3x3(out_ch, out_ch, init_scale=0.)
636
- if in_ch != out_ch:
637
- if conv_shortcut:
638
- self.Conv_2 = ddpm_conv3x3(in_ch, out_ch)
639
- else:
640
- self.NIN_0 = NIN(in_ch, out_ch)
641
- self.out_ch = out_ch
642
- self.in_ch = in_ch
643
- self.conv_shortcut = conv_shortcut
644
-
645
- def forward(self, x, temb=None):
646
- B, C, H, W = x.shape
647
- assert C == self.in_ch
648
- out_ch = self.out_ch if self.out_ch else self.in_ch
649
- h = self.act(self.GroupNorm_0(x))
650
- h = self.Conv_0(h)
651
- # Add bias to each feature map conditioned on the time embedding
652
- if temb is not None:
653
- h += self.Dense_0(self.act(temb))[:, :, None, None]
654
- h = self.act(self.GroupNorm_1(h))
655
- h = self.Dropout_0(h)
656
- h = self.Conv_1(h)
657
- if C != out_ch:
658
- if self.conv_shortcut:
659
- x = self.Conv_2(x)
660
- else:
661
- x = self.NIN_0(x)
662
  return x + h
 
+ # coding=utf-8
+ # Copyright 2020 The Google Research Authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ # pylint: skip-file
+ """Common layers for defining score networks.
+ """
+ import math
+ import string
+ from functools import partial
+ import torch.nn as nn
+ import torch
+ import torch.nn.functional as F
+ import numpy as np
+ from .normalization import ConditionalInstanceNorm2dPlus
+
+
+ def get_act(config):
+   """Get activation functions from the config file."""
+
+   if config == 'elu':
+     return nn.ELU()
+   elif config == 'relu':
+     return nn.ReLU()
+   elif config == 'lrelu':
+     return nn.LeakyReLU(negative_slope=0.2)
+   elif config == 'swish':
+     return nn.SiLU()
+   else:
+     raise NotImplementedError('activation function does not exist!')
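
A minimal usage sketch (editor's illustration, not part of the commit): in this fork get_act takes the activation name string directly rather than a full config object.

    # Illustrative only: 'swish' selects nn.SiLU(), applied elementwise.
    act = get_act('swish')
    y = act(torch.randn(4, 8))   # same shape as the input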


+ def ncsn_conv1x1(in_planes, out_planes, stride=1, bias=True, dilation=1, init_scale=1., padding=0):
+   """1x1 convolution. Same as NCSNv1/v2."""
+   conv = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=bias, dilation=dilation,
+                    padding=padding)
+   init_scale = 1e-10 if init_scale == 0 else init_scale
+   conv.weight.data *= init_scale
+   conv.bias.data *= init_scale
+   return conv


+ def variance_scaling(scale, mode, distribution,
+                      in_axis=1, out_axis=0,
+                      dtype=torch.float32,
+                      device='cpu'):
+   """Ported from JAX. """
+
+   def _compute_fans(shape, in_axis=1, out_axis=0):
+     receptive_field_size = np.prod(shape) / shape[in_axis] / shape[out_axis]
+     fan_in = shape[in_axis] * receptive_field_size
+     fan_out = shape[out_axis] * receptive_field_size
+     return fan_in, fan_out
+
+   def init(shape, dtype=dtype, device=device):
+     fan_in, fan_out = _compute_fans(shape, in_axis, out_axis)
+     if mode == "fan_in":
+       denominator = fan_in
+     elif mode == "fan_out":
+       denominator = fan_out
+     elif mode == "fan_avg":
+       denominator = (fan_in + fan_out) / 2
+     else:
+       raise ValueError(
+         "invalid mode for variance scaling initializer: {}".format(mode))
+     variance = scale / denominator
+     if distribution == "normal":
+       return torch.randn(*shape, dtype=dtype, device=device) * np.sqrt(variance)
+     elif distribution == "uniform":
+       return (torch.rand(*shape, dtype=dtype, device=device) * 2. - 1.) * np.sqrt(3 * variance)
+     else:
+       raise ValueError("invalid distribution for variance scaling initializer")
+
+   return init


+ def default_init(scale=1.):
+   """The same initialization used in DDPM."""
+   scale = 1e-10 if scale == 0 else scale
+   return variance_scaling(scale, 'fan_avg', 'uniform')
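
Note the calling convention: variance_scaling returns an init(shape) closure rather than a tensor, and default_init specializes it to DDPM's fan-averaged uniform scheme. A sketch (editor's illustration, not part of the commit):

    init_fn = default_init(scale=1.)   # closure over ('fan_avg', 'uniform')
    w = init_fn((64, 32, 3, 3))        # (out_ch, in_ch, kH, kW) conv weight
    # Var(w) is approximately scale / ((fan_in + fan_out) / 2).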


+ class Dense(nn.Module):
+   """Linear layer with `default_init`."""
+   def __init__(self):
+     super().__init__()


+ def ddpm_conv1x1(in_planes, out_planes, stride=1, bias=True, init_scale=1., padding=0):
+   """1x1 convolution with DDPM initialization."""
+   conv = nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, padding=padding, bias=bias)
+   conv.weight.data = default_init(init_scale)(conv.weight.data.shape)
+   nn.init.zeros_(conv.bias)
+   return conv


+ def ncsn_conv3x3(in_planes, out_planes, stride=1, bias=True, dilation=1, init_scale=1., padding=1):
+   """3x3 convolution with PyTorch initialization. Same as NCSNv1/NCSNv2."""
+   init_scale = 1e-10 if init_scale == 0 else init_scale
+   conv = nn.Conv2d(in_planes, out_planes, stride=stride, bias=bias,
+                    dilation=dilation, padding=padding, kernel_size=3)
+   conv.weight.data *= init_scale
+   conv.bias.data *= init_scale
+   return conv


+ def ddpm_conv3x3(in_planes, out_planes, stride=1, bias=True, dilation=1, init_scale=1., padding=1):
+   """3x3 convolution with DDPM initialization."""
+   conv = nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=padding,
+                    dilation=dilation, bias=bias)
+   conv.weight.data = default_init(init_scale)(conv.weight.data.shape)
+   nn.init.zeros_(conv.bias)
+   return conv

+ ###########################################################################
+ # Functions below are ported over from the NCSNv1/NCSNv2 codebase:
+ #   https://github.com/ermongroup/ncsn
+ #   https://github.com/ermongroup/ncsnv2
+ ###########################################################################


+ class CRPBlock(nn.Module):
+   def __init__(self, features, n_stages, act=nn.ReLU(), maxpool=True):
+     super().__init__()
+     self.convs = nn.ModuleList()
+     for i in range(n_stages):
+       self.convs.append(ncsn_conv3x3(features, features, stride=1, bias=False))
+     self.n_stages = n_stages
+     if maxpool:
+       self.pool = nn.MaxPool2d(kernel_size=5, stride=1, padding=2)
+     else:
+       self.pool = nn.AvgPool2d(kernel_size=5, stride=1, padding=2)
+
+     self.act = act
+
+   def forward(self, x):
+     x = self.act(x)
+     path = x
+     for i in range(self.n_stages):
+       path = self.pool(path)
+       path = self.convs[i](path)
+       x = path + x
+     return x


+ class CondCRPBlock(nn.Module):
+   def __init__(self, features, n_stages, num_classes, normalizer, act=nn.ReLU()):
+     super().__init__()
+     self.convs = nn.ModuleList()
+     self.norms = nn.ModuleList()
+     self.normalizer = normalizer
+     for i in range(n_stages):
+       self.norms.append(normalizer(features, num_classes, bias=True))
+       self.convs.append(ncsn_conv3x3(features, features, stride=1, bias=False))
+
+     self.n_stages = n_stages
+     self.pool = nn.AvgPool2d(kernel_size=5, stride=1, padding=2)
+     self.act = act
+
+   def forward(self, x, y):
+     x = self.act(x)
+     path = x
+     for i in range(self.n_stages):
+       path = self.norms[i](path, y)
+       path = self.pool(path)
+       path = self.convs[i](path)
+
+       x = path + x
+     return x


+ class RCUBlock(nn.Module):
+   def __init__(self, features, n_blocks, n_stages, act=nn.ReLU()):
+     super().__init__()
+
+     for i in range(n_blocks):
+       for j in range(n_stages):
+         setattr(self, '{}_{}_conv'.format(i + 1, j + 1), ncsn_conv3x3(features, features, stride=1, bias=False))
+
+     self.stride = 1
+     self.n_blocks = n_blocks
+     self.n_stages = n_stages
+     self.act = act
+
+   def forward(self, x):
+     for i in range(self.n_blocks):
+       residual = x
+       for j in range(self.n_stages):
+         x = self.act(x)
+         x = getattr(self, '{}_{}_conv'.format(i + 1, j + 1))(x)
+
+       x += residual
+     return x


+ class CondRCUBlock(nn.Module):
+   def __init__(self, features, n_blocks, n_stages, num_classes, normalizer, act=nn.ReLU()):
+     super().__init__()
+
+     for i in range(n_blocks):
+       for j in range(n_stages):
+         setattr(self, '{}_{}_norm'.format(i + 1, j + 1), normalizer(features, num_classes, bias=True))
+         setattr(self, '{}_{}_conv'.format(i + 1, j + 1), ncsn_conv3x3(features, features, stride=1, bias=False))
+
+     self.stride = 1
+     self.n_blocks = n_blocks
+     self.n_stages = n_stages
+     self.act = act
+     self.normalizer = normalizer
+
+   def forward(self, x, y):
+     for i in range(self.n_blocks):
+       residual = x
+       for j in range(self.n_stages):
+         x = getattr(self, '{}_{}_norm'.format(i + 1, j + 1))(x, y)
+         x = self.act(x)
+         x = getattr(self, '{}_{}_conv'.format(i + 1, j + 1))(x)
+
+       x += residual
+     return x


+ class MSFBlock(nn.Module):
+   def __init__(self, in_planes, features):
+     super().__init__()
+     assert isinstance(in_planes, list) or isinstance(in_planes, tuple)
+     self.convs = nn.ModuleList()
+     self.features = features
+
+     for i in range(len(in_planes)):
+       self.convs.append(ncsn_conv3x3(in_planes[i], features, stride=1, bias=True))
+
+   def forward(self, xs, shape):
+     sums = torch.zeros(xs[0].shape[0], self.features, *shape, device=xs[0].device)
+     for i in range(len(self.convs)):
+       h = self.convs[i](xs[i])
+       h = F.interpolate(h, size=shape, mode='bilinear', align_corners=True)
+       sums += h
+     return sums


+ class CondMSFBlock(nn.Module):
+   def __init__(self, in_planes, features, num_classes, normalizer):
+     super().__init__()
+     assert isinstance(in_planes, list) or isinstance(in_planes, tuple)
+
+     self.convs = nn.ModuleList()
+     self.norms = nn.ModuleList()
+     self.features = features
+     self.normalizer = normalizer
+
+     for i in range(len(in_planes)):
+       self.convs.append(ncsn_conv3x3(in_planes[i], features, stride=1, bias=True))
+       self.norms.append(normalizer(in_planes[i], num_classes, bias=True))
+
+   def forward(self, xs, y, shape):
+     sums = torch.zeros(xs[0].shape[0], self.features, *shape, device=xs[0].device)
+     for i in range(len(self.convs)):
+       h = self.norms[i](xs[i], y)
+       h = self.convs[i](h)
+       h = F.interpolate(h, size=shape, mode='bilinear', align_corners=True)
+       sums += h
+     return sums


+ class RefineBlock(nn.Module):
+   def __init__(self, in_planes, features, act=nn.ReLU(), start=False, end=False, maxpool=True):
+     super().__init__()
+
+     assert isinstance(in_planes, tuple) or isinstance(in_planes, list)
+     self.n_blocks = n_blocks = len(in_planes)
+
+     self.adapt_convs = nn.ModuleList()
+     for i in range(n_blocks):
+       self.adapt_convs.append(RCUBlock(in_planes[i], 2, 2, act))
+
+     self.output_convs = RCUBlock(features, 3 if end else 1, 2, act)
+
+     if not start:
+       self.msf = MSFBlock(in_planes, features)
+
+     self.crp = CRPBlock(features, 2, act, maxpool=maxpool)
+
+   def forward(self, xs, output_shape):
+     assert isinstance(xs, tuple) or isinstance(xs, list)
+     hs = []
+     for i in range(len(xs)):
+       h = self.adapt_convs[i](xs[i])
+       hs.append(h)
+
+     if self.n_blocks > 1:
+       h = self.msf(hs, output_shape)
+     else:
+       h = hs[0]
+
+     h = self.crp(h)
+     h = self.output_convs(h)
+
+     return h
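
RefineBlock fuses several feature scales RefineNet-style: per-input RCU stacks, MSF fusion resized to output_shape, then chained residual pooling. A shape sketch (editor's illustration, not part of the commit):

    block = RefineBlock((128, 64), features=64)
    xs = [torch.randn(2, 128, 8, 8), torch.randn(2, 64, 16, 16)]
    out = block(xs, output_shape=(16, 16))   # -> (2, 64, 16, 16)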


+ class CondRefineBlock(nn.Module):
+   def __init__(self, in_planes, features, num_classes, normalizer, act=nn.ReLU(), start=False, end=False):
+     super().__init__()
+
+     assert isinstance(in_planes, tuple) or isinstance(in_planes, list)
+     self.n_blocks = n_blocks = len(in_planes)
+
+     self.adapt_convs = nn.ModuleList()
+     for i in range(n_blocks):
+       self.adapt_convs.append(
+         CondRCUBlock(in_planes[i], 2, 2, num_classes, normalizer, act)
+       )
+
+     self.output_convs = CondRCUBlock(features, 3 if end else 1, 2, num_classes, normalizer, act)
+
+     if not start:
+       self.msf = CondMSFBlock(in_planes, features, num_classes, normalizer)
+
+     self.crp = CondCRPBlock(features, 2, num_classes, normalizer, act)
+
+   def forward(self, xs, y, output_shape):
+     assert isinstance(xs, tuple) or isinstance(xs, list)
+     hs = []
+     for i in range(len(xs)):
+       h = self.adapt_convs[i](xs[i], y)
+       hs.append(h)
+
+     if self.n_blocks > 1:
+       h = self.msf(hs, y, output_shape)
+     else:
+       h = hs[0]
+
+     h = self.crp(h, y)
+     h = self.output_convs(h, y)
+
+     return h


+ class ConvMeanPool(nn.Module):
+   def __init__(self, input_dim, output_dim, kernel_size=3, biases=True, adjust_padding=False):
+     super().__init__()
+     if not adjust_padding:
+       conv = nn.Conv2d(input_dim, output_dim, kernel_size, stride=1, padding=kernel_size // 2, bias=biases)
+       self.conv = conv
+     else:
+       conv = nn.Conv2d(input_dim, output_dim, kernel_size, stride=1, padding=kernel_size // 2, bias=biases)
+
+       self.conv = nn.Sequential(
+         nn.ZeroPad2d((1, 0, 1, 0)),
+         conv
+       )
+
+   def forward(self, inputs):
+     output = self.conv(inputs)
+     output = sum([output[:, :, ::2, ::2], output[:, :, 1::2, ::2],
+                   output[:, :, ::2, 1::2], output[:, :, 1::2, 1::2]]) / 4.
+     return output


+ class MeanPoolConv(nn.Module):
+   def __init__(self, input_dim, output_dim, kernel_size=3, biases=True):
+     super().__init__()
+     self.conv = nn.Conv2d(input_dim, output_dim, kernel_size, stride=1, padding=kernel_size // 2, bias=biases)
+
+   def forward(self, inputs):
+     output = inputs
+     output = sum([output[:, :, ::2, ::2], output[:, :, 1::2, ::2],
+                   output[:, :, ::2, 1::2], output[:, :, 1::2, 1::2]]) / 4.
+     return self.conv(output)


+ class UpsampleConv(nn.Module):
+   def __init__(self, input_dim, output_dim, kernel_size=3, biases=True):
+     super().__init__()
+     self.conv = nn.Conv2d(input_dim, output_dim, kernel_size, stride=1, padding=kernel_size // 2, bias=biases)
+     self.pixelshuffle = nn.PixelShuffle(upscale_factor=2)
+
+   def forward(self, inputs):
+     output = inputs
+     output = torch.cat([output, output, output, output], dim=1)
+     output = self.pixelshuffle(output)
+     return self.conv(output)
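
These three wrappers implement stride-1 resampling: ConvMeanPool convolves and then averages the four 2-strided sub-grids (a 2x mean pool), MeanPoolConv does the same in the opposite order, and UpsampleConv emulates nearest-neighbour 2x upsampling by duplicating channels through PixelShuffle before convolving. A shape sketch (editor's illustration, not part of the commit):

    x = torch.randn(2, 16, 32, 32)
    down = ConvMeanPool(16, 32)(x)    # -> (2, 32, 16, 16)
    up = UpsampleConv(16, 8)(x)       # -> (2, 8, 64, 64)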


+ class ConditionalResidualBlock(nn.Module):
+   def __init__(self, input_dim, output_dim, num_classes, resample=1, act=nn.ELU(),
+                normalization=ConditionalInstanceNorm2dPlus, adjust_padding=False, dilation=None):
+     super().__init__()
+     self.non_linearity = act
+     self.input_dim = input_dim
+     self.output_dim = output_dim
+     self.resample = resample
+     self.normalization = normalization
+     if resample == 'down':
+       if dilation > 1:
+         self.conv1 = ncsn_conv3x3(input_dim, input_dim, dilation=dilation)
+         self.normalize2 = normalization(input_dim, num_classes)
+         self.conv2 = ncsn_conv3x3(input_dim, output_dim, dilation=dilation)
+         conv_shortcut = partial(ncsn_conv3x3, dilation=dilation)
+       else:
+         self.conv1 = ncsn_conv3x3(input_dim, input_dim)
+         self.normalize2 = normalization(input_dim, num_classes)
+         self.conv2 = ConvMeanPool(input_dim, output_dim, 3, adjust_padding=adjust_padding)
+         conv_shortcut = partial(ConvMeanPool, kernel_size=1, adjust_padding=adjust_padding)
+
+     elif resample is None:
+       if dilation > 1:
+         conv_shortcut = partial(ncsn_conv3x3, dilation=dilation)
+         self.conv1 = ncsn_conv3x3(input_dim, output_dim, dilation=dilation)
+         self.normalize2 = normalization(output_dim, num_classes)
+         self.conv2 = ncsn_conv3x3(output_dim, output_dim, dilation=dilation)
+       else:
+         conv_shortcut = nn.Conv2d
+         self.conv1 = ncsn_conv3x3(input_dim, output_dim)
+         self.normalize2 = normalization(output_dim, num_classes)
+         self.conv2 = ncsn_conv3x3(output_dim, output_dim)
+     else:
+       raise Exception('invalid resample value')
+
+     if output_dim != input_dim or resample is not None:
+       self.shortcut = conv_shortcut(input_dim, output_dim)
+
+     self.normalize1 = normalization(input_dim, num_classes)
+
+   def forward(self, x, y):
+     output = self.normalize1(x, y)
+     output = self.non_linearity(output)
+     output = self.conv1(output)
+     output = self.normalize2(output, y)
+     output = self.non_linearity(output)
+     output = self.conv2(output)
+
+     if self.output_dim == self.input_dim and self.resample is None:
+       shortcut = x
+     else:
+       shortcut = self.shortcut(x)
+
+     return shortcut + output


+ class ResidualBlock(nn.Module):
+   def __init__(self, input_dim, output_dim, resample=None, act=nn.ELU(),
+                normalization=nn.InstanceNorm2d, adjust_padding=False, dilation=1):
+     super().__init__()
+     self.non_linearity = act
+     self.input_dim = input_dim
+     self.output_dim = output_dim
+     self.resample = resample
+     self.normalization = normalization
+     if resample == 'down':
+       if dilation > 1:
+         self.conv1 = ncsn_conv3x3(input_dim, input_dim, dilation=dilation)
+         self.normalize2 = normalization(input_dim)
+         self.conv2 = ncsn_conv3x3(input_dim, output_dim, dilation=dilation)
+         conv_shortcut = partial(ncsn_conv3x3, dilation=dilation)
+       else:
+         self.conv1 = ncsn_conv3x3(input_dim, input_dim)
+         self.normalize2 = normalization(input_dim)
+         self.conv2 = ConvMeanPool(input_dim, output_dim, 3, adjust_padding=adjust_padding)
+         conv_shortcut = partial(ConvMeanPool, kernel_size=1, adjust_padding=adjust_padding)
+
+     elif resample is None:
+       if dilation > 1:
+         conv_shortcut = partial(ncsn_conv3x3, dilation=dilation)
+         self.conv1 = ncsn_conv3x3(input_dim, output_dim, dilation=dilation)
+         self.normalize2 = normalization(output_dim)
+         self.conv2 = ncsn_conv3x3(output_dim, output_dim, dilation=dilation)
+       else:
+         # conv_shortcut = nn.Conv2d ### Something weird here.
+         conv_shortcut = partial(ncsn_conv1x1)
+         self.conv1 = ncsn_conv3x3(input_dim, output_dim)
+         self.normalize2 = normalization(output_dim)
+         self.conv2 = ncsn_conv3x3(output_dim, output_dim)
+     else:
+       raise Exception('invalid resample value')
+
+     if output_dim != input_dim or resample is not None:
+       self.shortcut = conv_shortcut(input_dim, output_dim)
+
+     self.normalize1 = normalization(input_dim)
+
+   def forward(self, x):
+     output = self.normalize1(x)
+     output = self.non_linearity(output)
+     output = self.conv1(output)
+     output = self.normalize2(output)
+     output = self.non_linearity(output)
+     output = self.conv2(output)
+
+     if self.output_dim == self.input_dim and self.resample is None:
+       shortcut = x
+     else:
+       shortcut = self.shortcut(x)
+
+     return shortcut + output


+ ###########################################################################
+ # Functions below are ported over from the DDPM codebase:
+ #   https://github.com/hojonathanho/diffusion/blob/master/diffusion_tf/nn.py
+ ###########################################################################

+ def get_timestep_embedding(timesteps, embedding_dim, max_positions=10000):
+   assert len(timesteps.shape) == 1  # and timesteps.dtype == tf.int32
+   half_dim = embedding_dim // 2
+   # magic number 10000 is from transformers
+   emb = math.log(max_positions) / (half_dim - 1)
+   # emb = math.log(2.) / (half_dim - 1)
+   emb = torch.exp(torch.arange(half_dim, dtype=torch.float32, device=timesteps.device) * -emb)
+   # emb = tf.range(num_embeddings, dtype=jnp.float32)[:, None] * emb[None, :]
+   # emb = tf.cast(timesteps, dtype=jnp.float32)[:, None] * emb[None, :]
+   emb = timesteps.float()[:, None] * emb[None, :]
+   emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+   if embedding_dim % 2 == 1:  # zero pad
+     emb = F.pad(emb, (0, 1), mode='constant')
+   assert emb.shape == (timesteps.shape[0], embedding_dim)
+   return emb
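
This is the standard transformer sinusoid: half the output dimensions carry sin(t * f_i) and half cos(t * f_i), with frequencies spaced geometrically between 1 and 1/max_positions. A sketch (editor's illustration, not part of the commit):

    t = torch.tensor([0, 10, 500])
    emb = get_timestep_embedding(t, embedding_dim=128)   # -> (3, 128)
    # emb[:, :64] are the sin features, emb[:, 64:] the cos features.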


+ def _einsum(a, b, c, x, y):
+   einsum_str = '{},{}->{}'.format(''.join(a), ''.join(b), ''.join(c))
+   return torch.einsum(einsum_str, x, y)


+ def contract_inner(x, y):
+   """tensordot(x, y, 1)."""
+   x_chars = list(string.ascii_lowercase[:len(x.shape)])
+   y_chars = list(string.ascii_lowercase[len(x.shape):len(y.shape) + len(x.shape)])
+   y_chars[0] = x_chars[-1]  # first axis of y and last of x get summed
+   out_chars = x_chars[:-1] + y_chars[1:]
+   return _einsum(x_chars, y_chars, out_chars, x, y)


+ class NIN(nn.Module):
+   def __init__(self, in_dim, num_units, init_scale=0.1):
+     super().__init__()
+     self.W = nn.Parameter(default_init(scale=init_scale)((in_dim, num_units)), requires_grad=True)
+     self.b = nn.Parameter(torch.zeros(num_units), requires_grad=True)
+
+   def forward(self, x):
+     x = x.permute(0, 2, 3, 1)
+     y = contract_inner(x, self.W) + self.b
+     return y.permute(0, 3, 1, 2)
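
NIN ("network-in-network") is a 1x1 convolution written as an einsum over the channel axis, which contract_inner assembles dynamically from the tensor ranks. A sketch (editor's illustration, not part of the commit):

    nin = NIN(in_dim=16, num_units=32)
    y = nin(torch.randn(2, 16, 8, 8))   # -> (2, 32, 8, 8)
    # Functionally an nn.Conv2d(16, 32, kernel_size=1); only the init differs.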


+ class AttnBlock(nn.Module):
+   """Channel-wise self-attention block."""
+   def __init__(self, channels):
+     super().__init__()
+     self.GroupNorm_0 = nn.GroupNorm(num_groups=32, num_channels=channels, eps=1e-6)
+     self.NIN_0 = NIN(channels, channels)
+     self.NIN_1 = NIN(channels, channels)
+     self.NIN_2 = NIN(channels, channels)
+     self.NIN_3 = NIN(channels, channels, init_scale=0.)
+
+   def forward(self, x):
+     B, C, H, W = x.shape
+     h = self.GroupNorm_0(x)
+     q = self.NIN_0(h)
+     k = self.NIN_1(h)
+     v = self.NIN_2(h)
+
+     w = torch.einsum('bchw,bcij->bhwij', q, k) * (int(C) ** (-0.5))
+     w = torch.reshape(w, (B, H, W, H * W))
+     w = F.softmax(w, dim=-1)
+     w = torch.reshape(w, (B, H, W, H, W))
+     h = torch.einsum('bhwij,bcij->bchw', w, v)
+     h = self.NIN_3(h)
+     return x + h
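
Note that the attention weights form a (B, H, W, H, W) tensor, so memory scales with (H*W)^2; the block is only practical at coarse resolutions. A sketch (editor's illustration, not part of the commit):

    attn = AttnBlock(channels=64)      # 64 is divisible by the 32 groups
    x = torch.randn(1, 64, 16, 16)
    assert attn(x).shape == x.shape    # shape-preserving residual block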


+ class Upsample(nn.Module):
+   def __init__(self, channels, with_conv=False):
+     super().__init__()
+     if with_conv:
+       self.Conv_0 = ddpm_conv3x3(channels, channels)
+     self.with_conv = with_conv
+
+   def forward(self, x):
+     B, C, H, W = x.shape
+     h = F.interpolate(x, (H * 2, W * 2), mode='nearest')
+     if self.with_conv:
+       h = self.Conv_0(h)
+     return h


+ class Downsample(nn.Module):
+   def __init__(self, channels, with_conv=False):
+     super().__init__()
+     if with_conv:
+       self.Conv_0 = ddpm_conv3x3(channels, channels, stride=2, padding=0)
+     self.with_conv = with_conv
+
+   def forward(self, x):
+     B, C, H, W = x.shape
+     # Emulate 'SAME' padding
+     if self.with_conv:
+       x = F.pad(x, (0, 1, 0, 1))
+       x = self.Conv_0(x)
+     else:
+       x = F.avg_pool2d(x, kernel_size=2, stride=2, padding=0)
+
+     assert x.shape == (B, C, H // 2, W // 2)
+     return x


+ class ResnetBlockDDPM(nn.Module):
+   """The ResNet Blocks used in DDPM."""
+   def __init__(self, act, in_ch, out_ch=None, temb_dim=None, conv_shortcut=False, dropout=0.1):
+     super().__init__()
+     if out_ch is None:
+       out_ch = in_ch
+     self.GroupNorm_0 = nn.GroupNorm(num_groups=32, num_channels=in_ch, eps=1e-6)
+     self.act = act
+     self.Conv_0 = ddpm_conv3x3(in_ch, out_ch)
+     if temb_dim is not None:
+       self.Dense_0 = nn.Linear(temb_dim, out_ch)
+       self.Dense_0.weight.data = default_init()(self.Dense_0.weight.data.shape)
+       nn.init.zeros_(self.Dense_0.bias)
+
+     self.GroupNorm_1 = nn.GroupNorm(num_groups=32, num_channels=out_ch, eps=1e-6)
+     self.Dropout_0 = nn.Dropout(dropout)
+     self.Conv_1 = ddpm_conv3x3(out_ch, out_ch, init_scale=0.)
+     if in_ch != out_ch:
+       if conv_shortcut:
+         self.Conv_2 = ddpm_conv3x3(in_ch, out_ch)
+       else:
+         self.NIN_0 = NIN(in_ch, out_ch)
+     self.out_ch = out_ch
+     self.in_ch = in_ch
+     self.conv_shortcut = conv_shortcut
+
+   def forward(self, x, temb=None):
+     B, C, H, W = x.shape
+     assert C == self.in_ch
+     out_ch = self.out_ch if self.out_ch else self.in_ch
+     h = self.act(self.GroupNorm_0(x))
+     h = self.Conv_0(h)
+     # Add bias to each feature map conditioned on the time embedding
+     if temb is not None:
+       h += self.Dense_0(self.act(temb))[:, :, None, None]
+     h = self.act(self.GroupNorm_1(h))
+     h = self.Dropout_0(h)
+     h = self.Conv_1(h)
+     if C != out_ch:
+       if self.conv_shortcut:
+         x = self.Conv_2(x)
+       else:
+         x = self.NIN_0(x)
+     return x + h
sgmse/backbones/ncsnpp_utils/layerspp.py CHANGED
@@ -1,274 +1,274 @@
+ # coding=utf-8
+ # Copyright 2020 The Google Research Authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.

+ # pylint: skip-file
+ """Layers for defining NCSN++.
+ """
+ from . import layers
+ from . import up_or_down_sampling
+ import torch.nn as nn
+ import torch
+ import torch.nn.functional as F
+ import numpy as np

+ conv1x1 = layers.ddpm_conv1x1
+ conv3x3 = layers.ddpm_conv3x3
+ NIN = layers.NIN
+ default_init = layers.default_init


+ class GaussianFourierProjection(nn.Module):
+   """Gaussian Fourier embeddings for noise levels."""
+
+   def __init__(self, embedding_size=256, scale=1.0):
+     super().__init__()
+     self.W = nn.Parameter(torch.randn(embedding_size) * scale, requires_grad=False)
+
+   def forward(self, x):
+     x_proj = x[:, None] * self.W[None, :] * 2 * np.pi
+     return torch.cat([torch.sin(x_proj), torch.cos(x_proj)], dim=-1)
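
The projection embeds a continuous noise level (or diffusion time) with fixed random Fourier features: W is frozen, and the sin/cos concatenation doubles embedding_size. A sketch (editor's illustration, not part of the commit; embedding log-sigmas is one common choice):

    proj = GaussianFourierProjection(embedding_size=256, scale=16.)
    sigmas = torch.rand(8) + 0.01      # one noise level per batch item
    emb = proj(torch.log(sigmas))      # -> (8, 512)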


+ class Combine(nn.Module):
+   """Combine information from skip connections."""
+
+   def __init__(self, dim1, dim2, method='cat'):
+     super().__init__()
+     self.Conv_0 = conv1x1(dim1, dim2)
+     self.method = method
+
+   def forward(self, x, y):
+     h = self.Conv_0(x)
+     if self.method == 'cat':
+       return torch.cat([h, y], dim=1)
+     elif self.method == 'sum':
+       return h + y
+     else:
+       raise ValueError(f'Method {self.method} not recognized.')


+ class AttnBlockpp(nn.Module):
+   """Channel-wise self-attention block. Modified from DDPM."""
+
+   def __init__(self, channels, skip_rescale=False, init_scale=0.):
+     super().__init__()
+     self.GroupNorm_0 = nn.GroupNorm(num_groups=min(channels // 4, 32), num_channels=channels,
+                                     eps=1e-6)
+     self.NIN_0 = NIN(channels, channels)
+     self.NIN_1 = NIN(channels, channels)
+     self.NIN_2 = NIN(channels, channels)
+     self.NIN_3 = NIN(channels, channels, init_scale=init_scale)
+     self.skip_rescale = skip_rescale
+
+   def forward(self, x):
+     B, C, H, W = x.shape
+     h = self.GroupNorm_0(x)
+     q = self.NIN_0(h)
+     k = self.NIN_1(h)
+     v = self.NIN_2(h)
+
+     w = torch.einsum('bchw,bcij->bhwij', q, k) * (int(C) ** (-0.5))
+     w = torch.reshape(w, (B, H, W, H * W))
+     w = F.softmax(w, dim=-1)
+     w = torch.reshape(w, (B, H, W, H, W))
+     h = torch.einsum('bhwij,bcij->bchw', w, v)
+     h = self.NIN_3(h)
+     if not self.skip_rescale:
+       return x + h
+     else:
+       return (x + h) / np.sqrt(2.)


+ class Upsample(nn.Module):
+   def __init__(self, in_ch=None, out_ch=None, with_conv=False, fir=False,
+                fir_kernel=(1, 3, 3, 1)):
+     super().__init__()
+     out_ch = out_ch if out_ch else in_ch
+     if not fir:
+       if with_conv:
+         self.Conv_0 = conv3x3(in_ch, out_ch)
+     else:
+       if with_conv:
+         self.Conv2d_0 = up_or_down_sampling.Conv2d(in_ch, out_ch,
+                                                    kernel=3, up=True,
+                                                    resample_kernel=fir_kernel,
+                                                    use_bias=True,
+                                                    kernel_init=default_init())
+     self.fir = fir
+     self.with_conv = with_conv
+     self.fir_kernel = fir_kernel
+     self.out_ch = out_ch
+
+   def forward(self, x):
+     B, C, H, W = x.shape
+     if not self.fir:
+       h = F.interpolate(x, (H * 2, W * 2), 'nearest')
+       if self.with_conv:
+         h = self.Conv_0(h)
+     else:
+       if not self.with_conv:
+         h = up_or_down_sampling.upsample_2d(x, self.fir_kernel, factor=2)
+       else:
+         h = self.Conv2d_0(x)
+
+     return h


+ class Downsample(nn.Module):
+   def __init__(self, in_ch=None, out_ch=None, with_conv=False, fir=False,
+                fir_kernel=(1, 3, 3, 1)):
+     super().__init__()
+     out_ch = out_ch if out_ch else in_ch
+     if not fir:
+       if with_conv:
+         self.Conv_0 = conv3x3(in_ch, out_ch, stride=2, padding=0)
+     else:
+       if with_conv:
+         self.Conv2d_0 = up_or_down_sampling.Conv2d(in_ch, out_ch,
+                                                    kernel=3, down=True,
+                                                    resample_kernel=fir_kernel,
+                                                    use_bias=True,
+                                                    kernel_init=default_init())
+     self.fir = fir
+     self.fir_kernel = fir_kernel
+     self.with_conv = with_conv
+     self.out_ch = out_ch
+
+   def forward(self, x):
+     B, C, H, W = x.shape
+     if not self.fir:
+       if self.with_conv:
+         x = F.pad(x, (0, 1, 0, 1))
+         x = self.Conv_0(x)
+       else:
+         x = F.avg_pool2d(x, 2, stride=2)
+     else:
+       if not self.with_conv:
+         x = up_or_down_sampling.downsample_2d(x, self.fir_kernel, factor=2)
+       else:
+         x = self.Conv2d_0(x)
+
+     return x


+ class ResnetBlockDDPMpp(nn.Module):
+   """ResBlock adapted from DDPM."""
+
+   def __init__(self, act, in_ch, out_ch=None, temb_dim=None, conv_shortcut=False,
+                dropout=0.1, skip_rescale=False, init_scale=0.):
+     super().__init__()
+     out_ch = out_ch if out_ch else in_ch
+     self.GroupNorm_0 = nn.GroupNorm(num_groups=min(in_ch // 4, 32), num_channels=in_ch, eps=1e-6)
+     self.Conv_0 = conv3x3(in_ch, out_ch)
+     if temb_dim is not None:
+       self.Dense_0 = nn.Linear(temb_dim, out_ch)
+       self.Dense_0.weight.data = default_init()(self.Dense_0.weight.data.shape)
+       nn.init.zeros_(self.Dense_0.bias)
+     self.GroupNorm_1 = nn.GroupNorm(num_groups=min(out_ch // 4, 32), num_channels=out_ch, eps=1e-6)
+     self.Dropout_0 = nn.Dropout(dropout)
+     self.Conv_1 = conv3x3(out_ch, out_ch, init_scale=init_scale)
+     if in_ch != out_ch:
+       if conv_shortcut:
+         self.Conv_2 = conv3x3(in_ch, out_ch)
+       else:
+         self.NIN_0 = NIN(in_ch, out_ch)
+
+     self.skip_rescale = skip_rescale
+     self.act = act
+     self.out_ch = out_ch
+     self.conv_shortcut = conv_shortcut
+
+   def forward(self, x, temb=None):
+     h = self.act(self.GroupNorm_0(x))
+     h = self.Conv_0(h)
+     if temb is not None:
+       h += self.Dense_0(self.act(temb))[:, :, None, None]
+     h = self.act(self.GroupNorm_1(h))
+     h = self.Dropout_0(h)
+     h = self.Conv_1(h)
+     if x.shape[1] != self.out_ch:
+       if self.conv_shortcut:
+         x = self.Conv_2(x)
+       else:
+         x = self.NIN_0(x)
+     if not self.skip_rescale:
+       return x + h
+     else:
+       return (x + h) / np.sqrt(2.)


+ class ResnetBlockBigGANpp(nn.Module):
+   def __init__(self, act, in_ch, out_ch=None, temb_dim=None, up=False, down=False,
+                dropout=0.1, fir=False, fir_kernel=(1, 3, 3, 1),
+                skip_rescale=True, init_scale=0.):
+     super().__init__()
+
+     out_ch = out_ch if out_ch else in_ch
+     self.GroupNorm_0 = nn.GroupNorm(num_groups=min(in_ch // 4, 32), num_channels=in_ch, eps=1e-6)
+     self.up = up
+     self.down = down
+     self.fir = fir
+     self.fir_kernel = fir_kernel
+
+     self.Conv_0 = conv3x3(in_ch, out_ch)
+     if temb_dim is not None:
+       self.Dense_0 = nn.Linear(temb_dim, out_ch)
+       self.Dense_0.weight.data = default_init()(self.Dense_0.weight.shape)
+       nn.init.zeros_(self.Dense_0.bias)
+
+     self.GroupNorm_1 = nn.GroupNorm(num_groups=min(out_ch // 4, 32), num_channels=out_ch, eps=1e-6)
+     self.Dropout_0 = nn.Dropout(dropout)
+     self.Conv_1 = conv3x3(out_ch, out_ch, init_scale=init_scale)
+     if in_ch != out_ch or up or down:
+       self.Conv_2 = conv1x1(in_ch, out_ch)
+
+     self.skip_rescale = skip_rescale
+     self.act = act
+     self.in_ch = in_ch
+     self.out_ch = out_ch
+
+   def forward(self, x, temb=None):
+     h = self.act(self.GroupNorm_0(x))
+
+     if self.up:
+       if self.fir:
+         h = up_or_down_sampling.upsample_2d(h, self.fir_kernel, factor=2)
+         x = up_or_down_sampling.upsample_2d(x, self.fir_kernel, factor=2)
+       else:
+         h = up_or_down_sampling.naive_upsample_2d(h, factor=2)
+         x = up_or_down_sampling.naive_upsample_2d(x, factor=2)
+     elif self.down:
+       if self.fir:
+         h = up_or_down_sampling.downsample_2d(h, self.fir_kernel, factor=2)
+         x = up_or_down_sampling.downsample_2d(x, self.fir_kernel, factor=2)
+       else:
+         h = up_or_down_sampling.naive_downsample_2d(h, factor=2)
+         x = up_or_down_sampling.naive_downsample_2d(x, factor=2)
+
+     h = self.Conv_0(h)
+     # Add bias to each feature map conditioned on the time embedding
+     if temb is not None:
+       h += self.Dense_0(self.act(temb))[:, :, None, None]
+     h = self.act(self.GroupNorm_1(h))
+     h = self.Dropout_0(h)
+     h = self.Conv_1(h)
+
+     if self.in_ch != self.out_ch or self.up or self.down:
+       x = self.Conv_2(x)
+
+     if not self.skip_rescale:
+       return x + h
+     else:
+       return (x + h) / np.sqrt(2.)
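
ResnetBlockBigGANpp folds the 2x resampling into the residual computation itself, resampling both branches before the first convolution. A down-block sketch (editor's illustration, not part of the commit; relies on the naive_*_2d helpers from up_or_down_sampling):

    block = ResnetBlockBigGANpp(nn.SiLU(), in_ch=64, out_ch=128,
                                temb_dim=256, down=True)
    x, temb = torch.randn(2, 64, 32, 32), torch.randn(2, 256)
    out = block(x, temb)               # -> (2, 128, 16, 16)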
sgmse/backbones/ncsnpp_utils/normalization.py CHANGED
@@ -1,215 +1,215 @@
- # coding=utf-8
- # Copyright 2020 The Google Research Authors.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- #     http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- """Normalization layers."""
- import torch.nn as nn
- import torch
- import functools
-
-
- def get_normalization(config, conditional=False):
-   """Obtain normalization modules from the config file."""
-   norm = config.model.normalization
-   if conditional:
-     if norm == 'InstanceNorm++':
-       return functools.partial(ConditionalInstanceNorm2dPlus, num_classes=config.model.num_classes)
-     else:
-       raise NotImplementedError(f'{norm} not implemented yet.')
-   else:
-     if norm == 'InstanceNorm':
-       return nn.InstanceNorm2d
-     elif norm == 'InstanceNorm++':
-       return InstanceNorm2dPlus
-     elif norm == 'VarianceNorm':
-       return VarianceNorm2d
-     elif norm == 'GroupNorm':
-       return nn.GroupNorm
-     else:
-       raise ValueError('Unknown normalization: %s' % norm)
-
-
- class ConditionalBatchNorm2d(nn.Module):
-   def __init__(self, num_features, num_classes, bias=True):
-     super().__init__()
-     self.num_features = num_features
-     self.bias = bias
-     self.bn = nn.BatchNorm2d(num_features, affine=False)
-     if self.bias:
-       self.embed = nn.Embedding(num_classes, num_features * 2)
-       self.embed.weight.data[:, :num_features].uniform_()  # Initialise scale at N(1, 0.02)
-       self.embed.weight.data[:, num_features:].zero_()  # Initialise bias at 0
-     else:
-       self.embed = nn.Embedding(num_classes, num_features)
-       self.embed.weight.data.uniform_()
-
-   def forward(self, x, y):
-     out = self.bn(x)
-     if self.bias:
-       gamma, beta = self.embed(y).chunk(2, dim=1)
-       out = gamma.view(-1, self.num_features, 1, 1) * out + beta.view(-1, self.num_features, 1, 1)
-     else:
-       gamma = self.embed(y)
-       out = gamma.view(-1, self.num_features, 1, 1) * out
-     return out
-
-
- class ConditionalInstanceNorm2d(nn.Module):
-   def __init__(self, num_features, num_classes, bias=True):
-     super().__init__()
-     self.num_features = num_features
-     self.bias = bias
-     self.instance_norm = nn.InstanceNorm2d(num_features, affine=False, track_running_stats=False)
-     if bias:
-       self.embed = nn.Embedding(num_classes, num_features * 2)
-       self.embed.weight.data[:, :num_features].uniform_()  # Initialise scale at N(1, 0.02)
-       self.embed.weight.data[:, num_features:].zero_()  # Initialise bias at 0
-     else:
-       self.embed = nn.Embedding(num_classes, num_features)
-       self.embed.weight.data.uniform_()
-
-   def forward(self, x, y):
-     h = self.instance_norm(x)
-     if self.bias:
-       gamma, beta = self.embed(y).chunk(2, dim=-1)
-       out = gamma.view(-1, self.num_features, 1, 1) * h + beta.view(-1, self.num_features, 1, 1)
-     else:
-       gamma = self.embed(y)
-       out = gamma.view(-1, self.num_features, 1, 1) * h
-     return out
-
-
- class ConditionalVarianceNorm2d(nn.Module):
-   def __init__(self, num_features, num_classes, bias=False):
-     super().__init__()
-     self.num_features = num_features
-     self.bias = bias
-     self.embed = nn.Embedding(num_classes, num_features)
-     self.embed.weight.data.normal_(1, 0.02)
-
-   def forward(self, x, y):
-     vars = torch.var(x, dim=(2, 3), keepdim=True)
-     h = x / torch.sqrt(vars + 1e-5)
-
-     gamma = self.embed(y)
-     out = gamma.view(-1, self.num_features, 1, 1) * h
-     return out
-
-
- class VarianceNorm2d(nn.Module):
-   def __init__(self, num_features, bias=False):
-     super().__init__()
-     self.num_features = num_features
-     self.bias = bias
-     self.alpha = nn.Parameter(torch.zeros(num_features))
-     self.alpha.data.normal_(1, 0.02)
-
-   def forward(self, x):
-     vars = torch.var(x, dim=(2, 3), keepdim=True)
-     h = x / torch.sqrt(vars + 1e-5)
-
-     out = self.alpha.view(-1, self.num_features, 1, 1) * h
-     return out
-
-
- class ConditionalNoneNorm2d(nn.Module):
-   def __init__(self, num_features, num_classes, bias=True):
-     super().__init__()
-     self.num_features = num_features
-     self.bias = bias
-     if bias:
-       self.embed = nn.Embedding(num_classes, num_features * 2)
-       self.embed.weight.data[:, :num_features].uniform_()  # Initialise scale at N(1, 0.02)
-       self.embed.weight.data[:, num_features:].zero_()  # Initialise bias at 0
-     else:
-       self.embed = nn.Embedding(num_classes, num_features)
-       self.embed.weight.data.uniform_()
-
-   def forward(self, x, y):
-     if self.bias:
-       gamma, beta = self.embed(y).chunk(2, dim=-1)
-       out = gamma.view(-1, self.num_features, 1, 1) * x + beta.view(-1, self.num_features, 1, 1)
-     else:
-       gamma = self.embed(y)
-       out = gamma.view(-1, self.num_features, 1, 1) * x
-     return out
-
-
- class NoneNorm2d(nn.Module):
-   def __init__(self, num_features, bias=True):
-     super().__init__()
-
-   def forward(self, x):
-     return x
-
-
- class InstanceNorm2dPlus(nn.Module):
-   def __init__(self, num_features, bias=True):
-     super().__init__()
-     self.num_features = num_features
-     self.bias = bias
-     self.instance_norm = nn.InstanceNorm2d(num_features, affine=False, track_running_stats=False)
-     self.alpha = nn.Parameter(torch.zeros(num_features))
-     self.gamma = nn.Parameter(torch.zeros(num_features))
-     self.alpha.data.normal_(1, 0.02)
-     self.gamma.data.normal_(1, 0.02)
-     if bias:
-       self.beta = nn.Parameter(torch.zeros(num_features))
-
-   def forward(self, x):
-     means = torch.mean(x, dim=(2, 3))
-     m = torch.mean(means, dim=-1, keepdim=True)
-     v = torch.var(means, dim=-1, keepdim=True)
-     means = (means - m) / (torch.sqrt(v + 1e-5))
-     h = self.instance_norm(x)
-
-     if self.bias:
-       h = h + means[..., None, None] * self.alpha[..., None, None]
-       out = self.gamma.view(-1, self.num_features, 1, 1) * h + self.beta.view(-1, self.num_features, 1, 1)
-     else:
-       h = h + means[..., None, None] * self.alpha[..., None, None]
-       out = self.gamma.view(-1, self.num_features, 1, 1) * h
-     return out
-
-
- class ConditionalInstanceNorm2dPlus(nn.Module):
-   def __init__(self, num_features, num_classes, bias=True):
-     super().__init__()
-     self.num_features = num_features
-     self.bias = bias
-     self.instance_norm = nn.InstanceNorm2d(num_features, affine=False, track_running_stats=False)
-     if bias:
-       self.embed = nn.Embedding(num_classes, num_features * 3)
-       self.embed.weight.data[:, :2 * num_features].normal_(1, 0.02)  # Initialise scale at N(1, 0.02)
-       self.embed.weight.data[:, 2 * num_features:].zero_()  # Initialise bias at 0
-     else:
-       self.embed = nn.Embedding(num_classes, 2 * num_features)
-       self.embed.weight.data.normal_(1, 0.02)
-
-   def forward(self, x, y):
-     means = torch.mean(x, dim=(2, 3))
-     m = torch.mean(means, dim=-1, keepdim=True)
-     v = torch.var(means, dim=-1, keepdim=True)
-     means = (means - m) / (torch.sqrt(v + 1e-5))
-     h = self.instance_norm(x)
-
-     if self.bias:
-       gamma, alpha, beta = self.embed(y).chunk(3, dim=-1)
-       h = h + means[..., None, None] * alpha[..., None, None]
-       out = gamma.view(-1, self.num_features, 1, 1) * h + beta.view(-1, self.num_features, 1, 1)
-     else:
-       gamma, alpha = self.embed(y).chunk(2, dim=-1)
-       h = h + means[..., None, None] * alpha[..., None, None]
-       out = gamma.view(-1, self.num_features, 1, 1) * h
-     return out
 
+ # coding=utf-8
+ # Copyright 2020 The Google Research Authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.

+ """Normalization layers."""
+ import torch.nn as nn
+ import torch
+ import functools


+ def get_normalization(config, conditional=False):
+   """Obtain normalization modules from the config file."""
+   norm = config.model.normalization
+   if conditional:
+     if norm == 'InstanceNorm++':
+       return functools.partial(ConditionalInstanceNorm2dPlus, num_classes=config.model.num_classes)
+     else:
+       raise NotImplementedError(f'{norm} not implemented yet.')
+   else:
+     if norm == 'InstanceNorm':
+       return nn.InstanceNorm2d
+     elif norm == 'InstanceNorm++':
+       return InstanceNorm2dPlus
+     elif norm == 'VarianceNorm':
+       return VarianceNorm2d
+     elif norm == 'GroupNorm':
+       return nn.GroupNorm
+     else:
+       raise ValueError('Unknown normalization: %s' % norm)


+ class ConditionalBatchNorm2d(nn.Module):
+   def __init__(self, num_features, num_classes, bias=True):
+     super().__init__()
+     self.num_features = num_features
+     self.bias = bias
+     self.bn = nn.BatchNorm2d(num_features, affine=False)
+     if self.bias:
+       self.embed = nn.Embedding(num_classes, num_features * 2)
+       self.embed.weight.data[:, :num_features].uniform_()  # Initialise scale at N(1, 0.02)
+       self.embed.weight.data[:, num_features:].zero_()  # Initialise bias at 0
+     else:
+       self.embed = nn.Embedding(num_classes, num_features)
+       self.embed.weight.data.uniform_()
+
+   def forward(self, x, y):
+     out = self.bn(x)
+     if self.bias:
+       gamma, beta = self.embed(y).chunk(2, dim=1)
+       out = gamma.view(-1, self.num_features, 1, 1) * out + beta.view(-1, self.num_features, 1, 1)
+     else:
+       gamma = self.embed(y)
+       out = gamma.view(-1, self.num_features, 1, 1) * out
+     return out


+ class ConditionalInstanceNorm2d(nn.Module):
+   def __init__(self, num_features, num_classes, bias=True):
+     super().__init__()
+     self.num_features = num_features
+     self.bias = bias
+     self.instance_norm = nn.InstanceNorm2d(num_features, affine=False, track_running_stats=False)
+     if bias:
+       self.embed = nn.Embedding(num_classes, num_features * 2)
+       self.embed.weight.data[:, :num_features].uniform_()  # Initialise scale at N(1, 0.02)
+       self.embed.weight.data[:, num_features:].zero_()  # Initialise bias at 0
+     else:
+       self.embed = nn.Embedding(num_classes, num_features)
+       self.embed.weight.data.uniform_()
+
+   def forward(self, x, y):
+     h = self.instance_norm(x)
+     if self.bias:
+       gamma, beta = self.embed(y).chunk(2, dim=-1)
+       out = gamma.view(-1, self.num_features, 1, 1) * h + beta.view(-1, self.num_features, 1, 1)
+     else:
+       gamma = self.embed(y)
+       out = gamma.view(-1, self.num_features, 1, 1) * h
+     return out


+ class ConditionalVarianceNorm2d(nn.Module):
+   def __init__(self, num_features, num_classes, bias=False):
+     super().__init__()
+     self.num_features = num_features
+     self.bias = bias
+     self.embed = nn.Embedding(num_classes, num_features)
+     self.embed.weight.data.normal_(1, 0.02)
+
+   def forward(self, x, y):
+     vars = torch.var(x, dim=(2, 3), keepdim=True)
+     h = x / torch.sqrt(vars + 1e-5)
+
+     gamma = self.embed(y)
+     out = gamma.view(-1, self.num_features, 1, 1) * h
+     return out


+ class VarianceNorm2d(nn.Module):
+   def __init__(self, num_features, bias=False):
+     super().__init__()
+     self.num_features = num_features
+     self.bias = bias
+     self.alpha = nn.Parameter(torch.zeros(num_features))
+     self.alpha.data.normal_(1, 0.02)
+
+   def forward(self, x):
+     vars = torch.var(x, dim=(2, 3), keepdim=True)
+     h = x / torch.sqrt(vars + 1e-5)
+
+     out = self.alpha.view(-1, self.num_features, 1, 1) * h
+     return out
124
+
125
+
126
+ class ConditionalNoneNorm2d(nn.Module):
127
+ def __init__(self, num_features, num_classes, bias=True):
128
+ super().__init__()
129
+ self.num_features = num_features
130
+ self.bias = bias
131
+ if bias:
132
+ self.embed = nn.Embedding(num_classes, num_features * 2)
133
+ self.embed.weight.data[:, :num_features].uniform_() # Initialise scale at N(1, 0.02)
134
+ self.embed.weight.data[:, num_features:].zero_() # Initialise bias at 0
135
+ else:
136
+ self.embed = nn.Embedding(num_classes, num_features)
137
+ self.embed.weight.data.uniform_()
138
+
139
+ def forward(self, x, y):
140
+ if self.bias:
141
+ gamma, beta = self.embed(y).chunk(2, dim=-1)
142
+ out = gamma.view(-1, self.num_features, 1, 1) * x + beta.view(-1, self.num_features, 1, 1)
143
+ else:
144
+ gamma = self.embed(y)
145
+ out = gamma.view(-1, self.num_features, 1, 1) * x
146
+ return out
147
+
148
+
149
+ class NoneNorm2d(nn.Module):
150
+ def __init__(self, num_features, bias=True):
151
+ super().__init__()
152
+
153
+ def forward(self, x):
154
+ return x
155
+
156
+
157
+ class InstanceNorm2dPlus(nn.Module):
158
+ def __init__(self, num_features, bias=True):
159
+ super().__init__()
160
+ self.num_features = num_features
161
+ self.bias = bias
162
+ self.instance_norm = nn.InstanceNorm2d(num_features, affine=False, track_running_stats=False)
163
+ self.alpha = nn.Parameter(torch.zeros(num_features))
164
+ self.gamma = nn.Parameter(torch.zeros(num_features))
165
+ self.alpha.data.normal_(1, 0.02)
166
+ self.gamma.data.normal_(1, 0.02)
167
+ if bias:
168
+ self.beta = nn.Parameter(torch.zeros(num_features))
169
+
170
+ def forward(self, x):
171
+ means = torch.mean(x, dim=(2, 3))
172
+ m = torch.mean(means, dim=-1, keepdim=True)
173
+ v = torch.var(means, dim=-1, keepdim=True)
174
+ means = (means - m) / (torch.sqrt(v + 1e-5))
175
+ h = self.instance_norm(x)
176
+
177
+ if self.bias:
178
+ h = h + means[..., None, None] * self.alpha[..., None, None]
179
+ out = self.gamma.view(-1, self.num_features, 1, 1) * h + self.beta.view(-1, self.num_features, 1, 1)
180
+ else:
181
+ h = h + means[..., None, None] * self.alpha[..., None, None]
182
+ out = self.gamma.view(-1, self.num_features, 1, 1) * h
183
+ return out
184
+
185
+
186
+ class ConditionalInstanceNorm2dPlus(nn.Module):
187
+ def __init__(self, num_features, num_classes, bias=True):
188
+ super().__init__()
189
+ self.num_features = num_features
190
+ self.bias = bias
191
+ self.instance_norm = nn.InstanceNorm2d(num_features, affine=False, track_running_stats=False)
192
+ if bias:
193
+ self.embed = nn.Embedding(num_classes, num_features * 3)
194
+ self.embed.weight.data[:, :2 * num_features].normal_(1, 0.02) # Initialise scale at N(1, 0.02)
195
+ self.embed.weight.data[:, 2 * num_features:].zero_() # Initialise bias at 0
196
+ else:
197
+ self.embed = nn.Embedding(num_classes, 2 * num_features)
198
+ self.embed.weight.data.normal_(1, 0.02)
199
+
200
+ def forward(self, x, y):
201
+ means = torch.mean(x, dim=(2, 3))
202
+ m = torch.mean(means, dim=-1, keepdim=True)
203
+ v = torch.var(means, dim=-1, keepdim=True)
204
+ means = (means - m) / (torch.sqrt(v + 1e-5))
205
+ h = self.instance_norm(x)
206
+
207
+ if self.bias:
208
+ gamma, alpha, beta = self.embed(y).chunk(3, dim=-1)
209
+ h = h + means[..., None, None] * alpha[..., None, None]
210
+ out = gamma.view(-1, self.num_features, 1, 1) * h + beta.view(-1, self.num_features, 1, 1)
211
+ else:
212
+ gamma, alpha = self.embed(y).chunk(2, dim=-1)
213
+ h = h + means[..., None, None] * alpha[..., None, None]
214
+ out = gamma.view(-1, self.num_features, 1, 1) * h
215
+ return out
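For orientation, the InstanceNorm++ layers in this file re-inject the per-channel means that plain instance normalization discards: the means are standardized across channels and added back, scaled by a learned alpha. A minimal sketch of that mean path (shapes only; the tensors and the 0.5 stand-in for alpha are illustrative, not part of this commit):

    import torch
    import torch.nn.functional as F

    x = torch.randn(8, 64, 32, 32)              # (batch, channels, H, W)
    means = x.mean(dim=(2, 3))                  # per-channel means, shape (8, 64)
    m = means.mean(dim=-1, keepdim=True)        # mean over channels, shape (8, 1)
    v = means.var(dim=-1, keepdim=True)         # variance over channels, shape (8, 1)
    means = (means - m) / torch.sqrt(v + 1e-5)  # standardized means, still (8, 64)
    # add the standardized means back onto the instance-normalized features:
    h = F.instance_norm(x) + means[..., None, None] * 0.5  # 0.5 stands in for alpha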
sgmse/backbones/ncsnpp_utils/op/__init__.py CHANGED
@@ -1,2 +1 @@
1
- from .upfirdn2d import upfirdn2d
2
- # from .upfirdn2d_native import upfirdn2d
 
1
+ from .upfirdn2d import upfirdn2d
 
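The re-exported `upfirdn2d(input, kernel, up=1, down=1, pad=(0, 0))` pads, upsamples by zero-insertion, applies the 2D FIR `kernel`, and downsamples, all in one fused op; the resampling helpers further below call it exactly this way. A hedged usage sketch (the filter values are illustrative, and the compiled extension assumes a GPU):

    import torch
    from sgmse.backbones.ncsnpp_utils.op import upfirdn2d

    x = torch.randn(1, 3, 16, 16, device="cuda")
    k = torch.tensor([1., 3., 3., 1.], device="cuda")
    k = torch.outer(k, k) / k.sum() ** 2       # normalized separable FIR filter
    y = upfirdn2d(x, k, up=2, pad=(2, 1))      # 2x upsample plus smoothing
    assert y.shape[-2:] == (32, 32)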
sgmse/backbones/ncsnpp_utils/op/fused_act.py ADDED
@@ -0,0 +1,97 @@
1
+ import os
2
+
3
+ import torch
4
+ from torch import nn
5
+ from torch.nn import functional as F
6
+ from torch.autograd import Function
7
+ from torch.utils.cpp_extension import load
8
+
9
+
10
+ module_path = os.path.dirname(__file__)
11
+ fused = load(
12
+ "fused",
13
+ sources=[
14
+ os.path.join(module_path, "fused_bias_act.cpp"),
15
+ os.path.join(module_path, "fused_bias_act_kernel.cu"),
16
+ ],
17
+ )
18
+
19
+
20
+ class FusedLeakyReLUFunctionBackward(Function):
21
+ @staticmethod
22
+ def forward(ctx, grad_output, out, negative_slope, scale):
23
+ ctx.save_for_backward(out)
24
+ ctx.negative_slope = negative_slope
25
+ ctx.scale = scale
26
+
27
+ empty = grad_output.new_empty(0)
28
+
29
+ grad_input = fused.fused_bias_act(
30
+ grad_output, empty, out, 3, 1, negative_slope, scale
31
+ )
32
+
33
+ dim = [0]
34
+
35
+ if grad_input.ndim > 2:
36
+ dim += list(range(2, grad_input.ndim))
37
+
38
+ grad_bias = grad_input.sum(dim).detach()
39
+
40
+ return grad_input, grad_bias
41
+
42
+ @staticmethod
43
+ def backward(ctx, gradgrad_input, gradgrad_bias):
44
+ out, = ctx.saved_tensors
45
+ gradgrad_out = fused.fused_bias_act(
46
+ gradgrad_input, gradgrad_bias, out, 3, 1, ctx.negative_slope, ctx.scale
47
+ )
48
+
49
+ return gradgrad_out, None, None, None
50
+
51
+
52
+ class FusedLeakyReLUFunction(Function):
53
+ @staticmethod
54
+ def forward(ctx, input, bias, negative_slope, scale):
55
+ empty = input.new_empty(0)
56
+ out = fused.fused_bias_act(input, bias, empty, 3, 0, negative_slope, scale)
57
+ ctx.save_for_backward(out)
58
+ ctx.negative_slope = negative_slope
59
+ ctx.scale = scale
60
+
61
+ return out
62
+
63
+ @staticmethod
64
+ def backward(ctx, grad_output):
65
+ out, = ctx.saved_tensors
66
+
67
+ grad_input, grad_bias = FusedLeakyReLUFunctionBackward.apply(
68
+ grad_output, out, ctx.negative_slope, ctx.scale
69
+ )
70
+
71
+ return grad_input, grad_bias, None, None
72
+
73
+
74
+ class FusedLeakyReLU(nn.Module):
75
+ def __init__(self, channel, negative_slope=0.2, scale=2 ** 0.5):
76
+ super().__init__()
77
+
78
+ self.bias = nn.Parameter(torch.zeros(channel))
79
+ self.negative_slope = negative_slope
80
+ self.scale = scale
81
+
82
+ def forward(self, input):
83
+ return fused_leaky_relu(input, self.bias, self.negative_slope, self.scale)
84
+
85
+
86
+ def fused_leaky_relu(input, bias, negative_slope=0.2, scale=2 ** 0.5):
87
+ if input.device.type == "cpu":
88
+ rest_dim = [1] * (input.ndim - bias.ndim - 1)
89
+ return (
90
+ F.leaky_relu(
91
+ input + bias.view(1, bias.shape[0], *rest_dim), negative_slope=negative_slope
92
+ )
93
+ * scale
94
+ )
95
+
96
+ else:
97
+ return FusedLeakyReLUFunction.apply(input, bias, negative_slope, scale)
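The fused op above computes `scale * leaky_relu(input + bias)` with custom forward and backward kernels; the CPU branch of `fused_leaky_relu` is the plain-PyTorch reference for it. A small equivalence sketch (tensor shapes are illustrative):

    import torch
    import torch.nn.functional as F

    def fused_leaky_relu_ref(x, bias, negative_slope=0.2, scale=2 ** 0.5):
        # same semantics as the CUDA path, expressed in eager PyTorch
        return F.leaky_relu(x + bias.view(1, -1, 1, 1), negative_slope) * scale

    x = torch.randn(2, 8, 4, 4)
    b = torch.randn(8)
    y = fused_leaky_relu_ref(x, b)  # matches FusedLeakyReLU(8)(x) up to kernel numerics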
sgmse/backbones/ncsnpp_utils/op/fused_bias_act.cpp ADDED
@@ -0,0 +1,21 @@
1
+ #include <torch/extension.h>
2
+
3
+
4
+ torch::Tensor fused_bias_act_op(const torch::Tensor& input, const torch::Tensor& bias, const torch::Tensor& refer,
5
+ int act, int grad, float alpha, float scale);
6
+
7
+ #define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor")
8
+ #define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
9
+ #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
10
+
11
+ torch::Tensor fused_bias_act(const torch::Tensor& input, const torch::Tensor& bias, const torch::Tensor& refer,
12
+ int act, int grad, float alpha, float scale) {
13
+ CHECK_CUDA(input);
14
+ CHECK_CUDA(bias);
15
+
16
+ return fused_bias_act_op(input, bias, refer, act, grad, alpha, scale);
17
+ }
18
+
19
+ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
20
+ m.def("fused_bias_act", &fused_bias_act, "fused bias act (CUDA)");
21
+ }
sgmse/backbones/ncsnpp_utils/op/fused_bias_act_kernel.cu ADDED
@@ -0,0 +1,99 @@
1
+ // Copyright (c) 2019, NVIDIA Corporation. All rights reserved.
2
+ //
3
+ // This work is made available under the Nvidia Source Code License-NC.
4
+ // To view a copy of this license, visit
5
+ // https://nvlabs.github.io/stylegan2/license.html
6
+
7
+ #include <torch/types.h>
8
+
9
+ #include <ATen/ATen.h>
10
+ #include <ATen/AccumulateType.h>
11
+ #include <ATen/cuda/CUDAContext.h>
12
+ #include <ATen/cuda/CUDAApplyUtils.cuh>
13
+
14
+ #include <cuda.h>
15
+ #include <cuda_runtime.h>
16
+
17
+
18
+ template <typename scalar_t>
19
+ static __global__ void fused_bias_act_kernel(scalar_t* out, const scalar_t* p_x, const scalar_t* p_b, const scalar_t* p_ref,
20
+ int act, int grad, scalar_t alpha, scalar_t scale, int loop_x, int size_x, int step_b, int size_b, int use_bias, int use_ref) {
21
+ int xi = blockIdx.x * loop_x * blockDim.x + threadIdx.x;
22
+
23
+ scalar_t zero = 0.0;
24
+
25
+ for (int loop_idx = 0; loop_idx < loop_x && xi < size_x; loop_idx++, xi += blockDim.x) {
26
+ scalar_t x = p_x[xi];
27
+
28
+ if (use_bias) {
29
+ x += p_b[(xi / step_b) % size_b];
30
+ }
31
+
32
+ scalar_t ref = use_ref ? p_ref[xi] : zero;
33
+
34
+ scalar_t y;
35
+
36
+ switch (act * 10 + grad) {
37
+ default:
38
+ case 10: y = x; break;
39
+ case 11: y = x; break;
40
+ case 12: y = 0.0; break;
41
+
42
+ case 30: y = (x > 0.0) ? x : x * alpha; break;
43
+ case 31: y = (ref > 0.0) ? x : x * alpha; break;
44
+ case 32: y = 0.0; break;
45
+ }
46
+
47
+ out[xi] = y * scale;
48
+ }
49
+ }
50
+
51
+
52
+ torch::Tensor fused_bias_act_op(const torch::Tensor& input, const torch::Tensor& bias, const torch::Tensor& refer,
53
+ int act, int grad, float alpha, float scale) {
54
+ int curDevice = -1;
55
+ cudaGetDevice(&curDevice);
56
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream(curDevice);
57
+
58
+ auto x = input.contiguous();
59
+ auto b = bias.contiguous();
60
+ auto ref = refer.contiguous();
61
+
62
+ int use_bias = b.numel() ? 1 : 0;
63
+ int use_ref = ref.numel() ? 1 : 0;
64
+
65
+ int size_x = x.numel();
66
+ int size_b = b.numel();
67
+ int step_b = 1;
68
+
69
+ for (int i = 1 + 1; i < x.dim(); i++) {
70
+ step_b *= x.size(i);
71
+ }
72
+
73
+ int loop_x = 4;
74
+ int block_size = 4 * 32;
75
+ int grid_size = (size_x - 1) / (loop_x * block_size) + 1;
76
+
77
+ auto y = torch::empty_like(x);
78
+
79
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(x.scalar_type(), "fused_bias_act_kernel", [&] {
80
+ fused_bias_act_kernel<scalar_t><<<grid_size, block_size, 0, stream>>>(
81
+ y.data_ptr<scalar_t>(),
82
+ x.data_ptr<scalar_t>(),
83
+ b.data_ptr<scalar_t>(),
84
+ ref.data_ptr<scalar_t>(),
85
+ act,
86
+ grad,
87
+ alpha,
88
+ scale,
89
+ loop_x,
90
+ size_x,
91
+ step_b,
92
+ size_b,
93
+ use_bias,
94
+ use_ref
95
+ );
96
+ });
97
+
98
+ return y;
99
+ }
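The kernel dispatches on `act * 10 + grad`: `act` 1 is linear and 3 is leaky ReLU, while `grad` 0/1/2 select the forward value, the first derivative, and the (zero) second derivative; for gradients the sign gate comes from the saved output `refer`. A Python sketch of that branch table (illustrative only, not the CUDA op):

    import torch

    def fused_bias_act_ref(x, ref, act, grad, alpha, scale):
        code = act * 10 + grad
        if code in (10, 11):                   # linear: value and first gradient
            y = x
        elif code == 30:                       # leaky ReLU forward: gate on x itself
            y = torch.where(x > 0, x, x * alpha)
        elif code == 31:                       # leaky ReLU backward: gate on saved output
            y = torch.where(ref > 0, x, x * alpha)
        else:                                  # codes 12 and 32: second derivative is zero
            y = torch.zeros_like(x)
        return y * scale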
sgmse/backbones/ncsnpp_utils/up_or_down_sampling.py CHANGED
@@ -1,257 +1,257 @@
1
+ """Layers used for up-sampling or down-sampling images.
2
+
3
+ Many functions are ported from https://github.com/NVlabs/stylegan2.
4
+ """
5
+
6
+ import torch.nn as nn
7
+ import torch
8
+ import torch.nn.functional as F
9
+ import numpy as np
10
+ from .op import upfirdn2d
11
+
12
+
13
+ # Function ported from StyleGAN2
14
+ def get_weight(module,
15
+ shape,
16
+ weight_var='weight',
17
+ kernel_init=None):
18
+ """Get/create weight tensor for a convolution or fully-connected layer."""
19
+
20
+ return module.param(weight_var, kernel_init, shape)
21
+
22
+
23
+ class Conv2d(nn.Module):
24
+ """Conv2d layer with optimal upsampling and downsampling (StyleGAN2)."""
25
+
26
+ def __init__(self, in_ch, out_ch, kernel, up=False, down=False,
27
+ resample_kernel=(1, 3, 3, 1),
28
+ use_bias=True,
29
+ kernel_init=None):
30
+ super().__init__()
31
+ assert not (up and down)
32
+ assert kernel >= 1 and kernel % 2 == 1
33
+ self.weight = nn.Parameter(torch.zeros(out_ch, in_ch, kernel, kernel))
34
+ if kernel_init is not None:
35
+ self.weight.data = kernel_init(self.weight.data.shape)
36
+ if use_bias:
37
+ self.bias = nn.Parameter(torch.zeros(out_ch))
38
+
39
+ self.up = up
40
+ self.down = down
41
+ self.resample_kernel = resample_kernel
42
+ self.kernel = kernel
43
+ self.use_bias = use_bias
44
+
45
+ def forward(self, x):
46
+ if self.up:
47
+ x = upsample_conv_2d(x, self.weight, k=self.resample_kernel)
48
+ elif self.down:
49
+ x = conv_downsample_2d(x, self.weight, k=self.resample_kernel)
50
+ else:
51
+ x = F.conv2d(x, self.weight, stride=1, padding=self.kernel // 2)
52
+
53
+ if self.use_bias:
54
+ x = x + self.bias.reshape(1, -1, 1, 1)
55
+
56
+ return x
57
+
58
+
59
+ def naive_upsample_2d(x, factor=2):
60
+ _N, C, H, W = x.shape
61
+ x = torch.reshape(x, (-1, C, H, 1, W, 1))
62
+ x = x.repeat(1, 1, 1, factor, 1, factor)
63
+ return torch.reshape(x, (-1, C, H * factor, W * factor))
64
+
65
+
66
+ def naive_downsample_2d(x, factor=2):
67
+ _N, C, H, W = x.shape
68
+ x = torch.reshape(x, (-1, C, H // factor, factor, W // factor, factor))
69
+ return torch.mean(x, dim=(3, 5))
70
+
71
+
72
+ def upsample_conv_2d(x, w, k=None, factor=2, gain=1):
73
+ """Fused `upsample_2d()` followed by `tf.nn.conv2d()`.
74
+
75
+ Padding is performed only once at the beginning, not between the operations.
+ The fused op is considerably more efficient than performing the same
+ calculation using standard TensorFlow ops. It supports gradients of arbitrary order.
+ Args:
+ x: Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`.
+ w: Weight tensor of the shape `[filterH, filterW, inChannels, outChannels]`.
+ Grouped convolution can be performed by `inChannels = x.shape[0] // numGroups`.
+ k: FIR filter of the shape `[firH, firW]` or `[firN]` (separable).
+ The default is `[1] * factor`, which corresponds to nearest-neighbor upsampling.
89
+ factor: Integer upsampling factor (default: 2).
90
+ gain: Scaling factor for signal magnitude (default: 1.0).
91
+
92
+ Returns:
93
+ Tensor of the shape `[N, C, H * factor, W * factor]` or
94
+ `[N, H * factor, W * factor, C]`, and same datatype as `x`.
95
+ """
96
+
97
+ assert isinstance(factor, int) and factor >= 1
98
+
99
+ # Check weight shape.
100
+ assert len(w.shape) == 4
101
+ convH = w.shape[2]
102
+ convW = w.shape[3]
103
+ inC = w.shape[1]
104
+ outC = w.shape[0]
105
+
106
+ assert convW == convH
107
+
108
+ # Setup filter kernel.
109
+ if k is None:
110
+ k = [1] * factor
111
+ k = _setup_kernel(k) * (gain * (factor ** 2))
112
+ p = (k.shape[0] - factor) - (convW - 1)
113
+
114
+ stride = (factor, factor)
115
+
116
+ # Determine data dimensions.
117
+ stride = [1, 1, factor, factor]
118
+ output_shape = ((_shape(x, 2) - 1) * factor + convH, (_shape(x, 3) - 1) * factor + convW)
119
+ output_padding = (output_shape[0] - (_shape(x, 2) - 1) * stride[0] - convH,
120
+ output_shape[1] - (_shape(x, 3) - 1) * stride[1] - convW)
121
+ assert output_padding[0] >= 0 and output_padding[1] >= 0
122
+ num_groups = _shape(x, 1) // inC
123
+
124
+ # Transpose weights.
125
+ w = torch.reshape(w, (num_groups, -1, inC, convH, convW))
126
+ w = w[..., ::-1, ::-1].permute(0, 2, 1, 3, 4)
127
+ w = torch.reshape(w, (num_groups * inC, -1, convH, convW))
128
+
129
+ x = F.conv_transpose2d(x, w, stride=stride, output_padding=output_padding, padding=0)
130
+ ## Original TF code.
131
+ # x = tf.nn.conv2d_transpose(
132
+ # x,
133
+ # w,
134
+ # output_shape=output_shape,
135
+ # strides=stride,
136
+ # padding='VALID',
137
+ # data_format=data_format)
138
+ ## JAX equivalent
139
+
140
+ return upfirdn2d(x, torch.tensor(k, device=x.device),
141
+ pad=((p + 1) // 2 + factor - 1, p // 2 + 1))
142
+
143
+
144
+ def conv_downsample_2d(x, w, k=None, factor=2, gain=1):
145
+ """Fused `tf.nn.conv2d()` followed by `downsample_2d()`.
146
+
147
+ Padding is performed only once at the beginning, not between the operations.
148
+ The fused op is considerably more efficient than performing the same
+ calculation using standard TensorFlow ops. It supports gradients of arbitrary order.
+ Args:
+ x: Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`.
+ w: Weight tensor of the shape `[filterH, filterW, inChannels, outChannels]`.
+ Grouped convolution can be performed by `inChannels = x.shape[0] // numGroups`.
+ k: FIR filter of the shape `[firH, firW]` or `[firN]` (separable).
+ The default is `[1] * factor`, which corresponds to average pooling.
160
+ factor: Integer downsampling factor (default: 2).
161
+ gain: Scaling factor for signal magnitude (default: 1.0).
162
+
163
+ Returns:
164
+ Tensor of the shape `[N, C, H // factor, W // factor]` or
165
+ `[N, H // factor, W // factor, C]`, and same datatype as `x`.
166
+ """
167
+
168
+ assert isinstance(factor, int) and factor >= 1
169
+ _outC, _inC, convH, convW = w.shape
170
+ assert convW == convH
171
+ if k is None:
172
+ k = [1] * factor
173
+ k = _setup_kernel(k) * gain
174
+ p = (k.shape[0] - factor) + (convW - 1)
175
+ s = [factor, factor]
176
+ x = upfirdn2d(x, torch.tensor(k, device=x.device),
177
+ pad=((p + 1) // 2, p // 2))
178
+ return F.conv2d(x, w, stride=s, padding=0)
179
+
180
+
181
+ def _setup_kernel(k):
182
+ k = np.asarray(k, dtype=np.float32)
183
+ if k.ndim == 1:
184
+ k = np.outer(k, k)
185
+ k /= np.sum(k)
186
+ assert k.ndim == 2
187
+ assert k.shape[0] == k.shape[1]
188
+ return k
189
+
190
+
191
+ def _shape(x, dim):
192
+ return x.shape[dim]
193
+
194
+
195
+ def upsample_2d(x, k=None, factor=2, gain=1):
196
+ r"""Upsample a batch of 2D images with the given filter.
197
+
198
+ Accepts a batch of 2D images of the shape `[N, C, H, W]` or `[N, H, W, C]`
199
+ and upsamples each image with the given filter. The filter is normalized so that
+ if the input pixels are constant, they will be scaled by the specified `gain`.
+ Pixels outside the image are assumed to be zero, and the filter is padded with
+ zeros so that its shape is a multiple of the upsampling factor.
+ Args:
+ x: Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`.
+ k: FIR filter of the shape `[firH, firW]` or `[firN]` (separable).
+ The default is `[1] * factor`, which corresponds to nearest-neighbor upsampling.
212
+ factor: Integer upsampling factor (default: 2).
213
+ gain: Scaling factor for signal magnitude (default: 1.0).
214
+
215
+ Returns:
216
+ Tensor of the shape `[N, C, H * factor, W * factor]`
217
+ """
218
+ assert isinstance(factor, int) and factor >= 1
219
+ if k is None:
220
+ k = [1] * factor
221
+ k = _setup_kernel(k) * (gain * (factor ** 2))
222
+ p = k.shape[0] - factor
223
+ return upfirdn2d(x, torch.tensor(k, device=x.device),
224
+ up=factor, pad=((p + 1) // 2 + factor - 1, p // 2))
225
+
226
+
227
+ def downsample_2d(x, k=None, factor=2, gain=1):
228
+ r"""Downsample a batch of 2D images with the given filter.
229
+
230
+ Accepts a batch of 2D images of the shape `[N, C, H, W]` or `[N, H, W, C]`
231
+ and downsamples each image with the given filter. The filter is normalized so that
+ if the input pixels are constant, they will be scaled by the specified `gain`.
+ Pixels outside the image are assumed to be zero, and the filter is padded with
+ zeros so that its shape is a multiple of the downsampling factor.
+ Args:
+ x: Input tensor of the shape `[N, C, H, W]` or `[N, H, W, C]`.
+ k: FIR filter of the shape `[firH, firW]` or `[firN]` (separable).
+ The default is `[1] * factor`, which corresponds to average pooling.
244
+ factor: Integer downsampling factor (default: 2).
245
+ gain: Scaling factor for signal magnitude (default: 1.0).
246
+
247
+ Returns:
248
+ Tensor of the shape `[N, C, H // factor, W // factor]`
249
+ """
250
+
251
+ assert isinstance(factor, int) and factor >= 1
252
+ if k is None:
253
+ k = [1] * factor
254
+ k = _setup_kernel(k) * gain
255
+ p = k.shape[0] - factor
256
+ return upfirdn2d(x, torch.tensor(k, device=x.device),
257
+ down=factor, pad=((p + 1) // 2, p // 2))
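All four resampling entry points share the `_setup_kernel` normalization and the upfirdn2d padding arithmetic above. A hedged shape check (module path as it appears in this repo; the custom op assumes a GPU, while the naive_* helpers run anywhere):

    import torch
    from sgmse.backbones.ncsnpp_utils import up_or_down_sampling as uds

    x = torch.randn(1, 2, 16, 16, device="cuda")
    up = uds.upsample_2d(x, k=[1, 3, 3, 1])         # -> (1, 2, 32, 32)
    down = uds.downsample_2d(up, k=[1, 3, 3, 1])    # -> (1, 2, 16, 16)
    assert down.shape == x.shape
    # CPU-friendly sanity check with the naive variants:
    z = uds.naive_downsample_2d(uds.naive_upsample_2d(x.cpu()))
    assert z.shape == x.shape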
sgmse/backbones/ncsnpp_utils/utils.py CHANGED
@@ -1,189 +1,189 @@
1
+ # coding=utf-8
2
+ # Copyright 2020 The Google Research Authors.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """All functions and modules related to model definition.
17
+ """
18
+
19
+ import torch
20
+
21
+ import numpy as np
22
+ from ...sdes import OUVESDE, OUVPSDE
23
+
24
+
25
+ _MODELS = {}
26
+
27
+
28
+ def register_model(cls=None, *, name=None):
29
+ """A decorator for registering model classes."""
30
+
31
+ def _register(cls):
32
+ if name is None:
33
+ local_name = cls.__name__
34
+ else:
35
+ local_name = name
36
+ if local_name in _MODELS:
37
+ raise ValueError(f'Already registered model with name: {local_name}')
38
+ _MODELS[local_name] = cls
39
+ return cls
40
+
41
+ if cls is None:
42
+ return _register
43
+ else:
44
+ return _register(cls)
45
+
46
+
47
+ def get_model(name):
48
+ return _MODELS[name]
49
+
50
+
51
+ def get_sigmas(sigma_min, sigma_max, num_scales):
52
+ """Get sigmas --- the set of noise levels for SMLD from config files.
53
+ Args:
54
+ config: A ConfigDict object parsed from the config file
55
+ Returns:
56
+ sigmas: a jax numpy arrary of noise levels
57
+ """
58
+ sigmas = np.exp(
59
+ np.linspace(np.log(sigma_max), np.log(sigma_min), num_scales))
60
+
61
+ return sigmas
62
+
63
+
64
+ def get_ddpm_params(config):
65
+ """Get betas and alphas --- parameters used in the original DDPM paper."""
66
+ num_diffusion_timesteps = 1000
67
+ # parameters need to be adapted if number of time steps differs from 1000
68
+ beta_start = config.model.beta_min / config.model.num_scales
69
+ beta_end = config.model.beta_max / config.model.num_scales
70
+ betas = np.linspace(beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64)
71
+
72
+ alphas = 1. - betas
73
+ alphas_cumprod = np.cumprod(alphas, axis=0)
74
+ sqrt_alphas_cumprod = np.sqrt(alphas_cumprod)
75
+ sqrt_1m_alphas_cumprod = np.sqrt(1. - alphas_cumprod)
76
+
77
+ return {
78
+ 'betas': betas,
79
+ 'alphas': alphas,
80
+ 'alphas_cumprod': alphas_cumprod,
81
+ 'sqrt_alphas_cumprod': sqrt_alphas_cumprod,
82
+ 'sqrt_1m_alphas_cumprod': sqrt_1m_alphas_cumprod,
83
+ 'beta_min': beta_start * (num_diffusion_timesteps - 1),
84
+ 'beta_max': beta_end * (num_diffusion_timesteps - 1),
85
+ 'num_diffusion_timesteps': num_diffusion_timesteps
86
+ }
87
+
88
+
89
+ def create_model(config):
90
+ """Create the score model."""
91
+ model_name = config.model.name
92
+ score_model = get_model(model_name)(config)
93
+ score_model = score_model.to(config.device)
94
+ score_model = torch.nn.DataParallel(score_model)
95
+ return score_model
96
+
97
+
98
+ def get_model_fn(model, train=False):
99
+ """Create a function to give the output of the score-based model.
100
+
101
+ Args:
102
+ model: The score model.
103
+ train: `True` for training and `False` for evaluation.
104
+
105
+ Returns:
106
+ A model function.
107
+ """
108
+
109
+ def model_fn(x, labels):
110
+ """Compute the output of the score-based model.
111
+
112
+ Args:
113
+ x: A mini-batch of input data.
114
+ labels: A mini-batch of conditioning variables for time steps. Should be interpreted differently
115
+ for different models.
116
+
117
+ Returns:
118
+ The model output.
119
+ """
120
+ if not train:
121
+ model.eval()
122
+ return model(x, labels)
123
+ else:
124
+ model.train()
125
+ return model(x, labels)
126
+
127
+ return model_fn
128
+
129
+
130
+ def get_score_fn(sde, model, train=False, continuous=False):
131
+ """Wraps `score_fn` so that the model output corresponds to a real time-dependent score function.
132
+
133
+ Args:
134
+ sde: An `sde_lib.SDE` object that represents the forward SDE.
135
+ model: A score model.
136
+ train: `True` for training and `False` for evaluation.
137
+ continuous: If `True`, the score-based model is expected to directly take continuous time steps.
138
+
139
+ Returns:
140
+ A score function.
141
+ """
142
+ model_fn = get_model_fn(model, train=train)
143
+
144
+ if isinstance(sde, OUVPSDE):
145
+ def score_fn(x, t):
146
+ # Scale neural network output by standard deviation and flip sign
147
+ if continuous:
148
+ # For VP-trained models, t=0 corresponds to the lowest noise level
149
+ # The maximum value of time embedding is assumed to 999 for
150
+ # continuously-trained models.
151
+ labels = t * 999
152
+ score = model_fn(x, labels)
153
+ std = sde.marginal_prob(torch.zeros_like(x), t)[1]
154
+ else:
155
+ # For VP-trained models, t=0 corresponds to the lowest noise level
156
+ labels = t * (sde.N - 1)
157
+ score = model_fn(x, labels)
158
+ std = sde.sqrt_1m_alphas_cumprod.to(labels.device)[labels.long()]
159
+
160
+ score = -score / std[:, None, None, None]
161
+ return score
162
+
163
+ elif isinstance(sde, OUVESDE):
164
+ def score_fn(x, t):
165
+ if continuous:
166
+ labels = sde.marginal_prob(torch.zeros_like(x), t)[1]
167
+ else:
168
+ # For VE-trained models, t=0 corresponds to the highest noise level
169
+ labels = sde.T - t
170
+ labels *= sde.N - 1
171
+ labels = torch.round(labels).long()
172
+
173
+ score = model_fn(x, labels)
174
+ return score
175
+
176
+ else:
177
+ raise NotImplementedError(f"SDE class {sde.__class__.__name__} not yet supported.")
178
+
179
+ return score_fn
180
+
181
+
182
+ def to_flattened_numpy(x):
183
+ """Flatten a torch tensor `x` and convert it to numpy."""
184
+ return x.detach().cpu().numpy().reshape((-1,))
185
+
186
+
187
+ def from_flattened_numpy(x, shape):
188
+ """Form a torch tensor with the given `shape` from a flattened numpy array `x`."""
189
  return torch.from_numpy(x.reshape(shape))
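For orientation, `_MODELS` together with `register_model`/`get_model` implements a simple name-to-class registry for score models. A hedged usage sketch (the 'tiny' name and TinyScoreModel are hypothetical, not part of this commit):

    import torch.nn as nn
    from sgmse.backbones.ncsnpp_utils import utils

    @utils.register_model(name='tiny')
    class TinyScoreModel(nn.Module):
        def __init__(self, config):
            super().__init__()
            self.net = nn.Conv2d(1, 1, 3, padding=1)

        def forward(self, x, labels):
            return self.net(x)

    model_cls = utils.get_model('tiny')   # -> TinyScoreModel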
sgmse/backbones/ncsnpp_v2.py CHANGED
@@ -1,395 +1,395 @@
+ # coding=utf-8
+ # Copyright 2020 The Google Research Authors.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ # pylint: skip-file
+
+ from .ncsnpp_utils import layers, layerspp, normalization
+ import torch.nn as nn
+ import functools
+ import torch
+ import numpy as np
+
+ from .shared import BackboneRegistry
+
+ ResnetBlockDDPM = layerspp.ResnetBlockDDPMpp
+ ResnetBlockBigGAN = layerspp.ResnetBlockBigGANpp
+ Combine = layerspp.Combine
+ conv3x3 = layerspp.conv3x3
+ conv1x1 = layerspp.conv1x1
+ get_act = layers.get_act
+ get_normalization = normalization.get_normalization
+ default_initializer = layers.default_init
+
+
+ @BackboneRegistry.register("ncsnpp_v2")
+ class NCSNpp_v2(nn.Module):
+     """NCSN++ model, adapted from the https://github.com/yang-song/score_sde repository."""
+
+     @staticmethod
+     def add_argparse_args(parser):
+         parser.add_argument("--nf", type=int, default=128)
+         parser.add_argument("--ch_mult", type=int, nargs='+', default=[1, 1, 2, 2, 2, 2, 2])
+         parser.add_argument("--num_res_blocks", type=int, default=2)
+         parser.add_argument("--attn_resolutions", type=int, nargs='+', default=[16])
+         return parser
+
+     def __init__(self,
+         nf = 128,
+         ch_mult = (1, 1, 2, 2, 2, 2, 2),
+         num_res_blocks = 2,
+         attn_resolutions = (16,),
+         nonlinearity = 'swish',
+         resamp_with_conv = True,
+         fir = True,
+         fir_kernel = [1, 3, 3, 1],
+         skip_rescale = True,
+         resblock_type = 'biggan',
+         progressive = 'output_skip',
+         progressive_input = 'input_skip',
+         progressive_combine = 'sum',
+         init_scale = 0.,
+         fourier_scale = 16,
+         image_size = 256,
+         embedding_type = 'fourier',
+         dropout = .0,
+         **unused_kwargs
+     ):
+         super().__init__()
+         self.act = act = get_act(nonlinearity)
+
+         self.nf = nf
+         self.num_res_blocks = num_res_blocks
+         self.attn_resolutions = attn_resolutions
+         self.num_resolutions = num_resolutions = len(ch_mult)
+         self.all_resolutions = all_resolutions = [image_size // (2 ** i) for i in range(num_resolutions)]
+         self.skip_rescale = skip_rescale
+         self.resblock_type = resblock_type = resblock_type.lower()
+         self.progressive = progressive = progressive.lower()
+         self.progressive_input = progressive_input = progressive_input.lower()
+         self.embedding_type = embedding_type = embedding_type.lower()
+
+         assert progressive in ['none', 'output_skip', 'residual']
+         assert progressive_input in ['none', 'input_skip', 'residual']
+         assert embedding_type in ['fourier', 'positional']
+         combine_method = progressive_combine.lower()
+         combiner = functools.partial(Combine, method=combine_method)
+
+         in_channels = 4   # x.real, x.imag, y.real, y.imag
+         out_channels = 2  # score.real, score.imag
+         self.output_layer = nn.Conv2d(in_channels, out_channels, 1)
+
+         modules = []
+         # timestep/noise_level embedding
+         if embedding_type == 'fourier':
+             # Gaussian Fourier features embeddings.
+             modules.append(layerspp.GaussianFourierProjection(
+                 embedding_size=nf, scale=fourier_scale
+             ))
+             embed_dim = 2 * nf
+         elif embedding_type == 'positional':
+             embed_dim = nf
+         else:
+             raise ValueError(f'embedding type {embedding_type} unknown.')
+
+         modules.append(nn.Linear(embed_dim, nf * 4))
+         modules[-1].weight.data = default_initializer()(modules[-1].weight.shape)
+         nn.init.zeros_(modules[-1].bias)
+         modules.append(nn.Linear(nf * 4, nf * 4))
+         modules[-1].weight.data = default_initializer()(modules[-1].weight.shape)
+         nn.init.zeros_(modules[-1].bias)
+
+         AttnBlock = functools.partial(layerspp.AttnBlockpp,
+             init_scale=init_scale, skip_rescale=skip_rescale)
+
+         Upsample = functools.partial(layerspp.Upsample,
+             with_conv=resamp_with_conv, fir=fir, fir_kernel=fir_kernel)
+
+         if progressive == 'output_skip':
+             self.pyramid_upsample = layerspp.Upsample(fir=fir, fir_kernel=fir_kernel, with_conv=False)
+         elif progressive == 'residual':
+             pyramid_upsample = functools.partial(layerspp.Upsample, fir=fir,
+                 fir_kernel=fir_kernel, with_conv=True)
+
+         Downsample = functools.partial(layerspp.Downsample, with_conv=resamp_with_conv, fir=fir, fir_kernel=fir_kernel)
+
+         if progressive_input == 'input_skip':
+             self.pyramid_downsample = layerspp.Downsample(fir=fir, fir_kernel=fir_kernel, with_conv=False)
+         elif progressive_input == 'residual':
+             pyramid_downsample = functools.partial(layerspp.Downsample,
+                 fir=fir, fir_kernel=fir_kernel, with_conv=True)
+
+         if resblock_type == 'ddpm':
+             ResnetBlock = functools.partial(ResnetBlockDDPM, act=act,
+                 dropout=dropout, init_scale=init_scale,
+                 skip_rescale=skip_rescale, temb_dim=nf * 4)
+         elif resblock_type == 'biggan':
+             ResnetBlock = functools.partial(ResnetBlockBigGAN, act=act,
+                 dropout=dropout, fir=fir, fir_kernel=fir_kernel,
+                 init_scale=init_scale, skip_rescale=skip_rescale, temb_dim=nf * 4)
+         else:
+             raise ValueError(f'resblock type {resblock_type} unrecognized.')
+
+         # Downsampling block
+
+         channels = in_channels
+         if progressive_input != 'none':
+             input_pyramid_ch = channels
+
+         modules.append(conv3x3(channels, nf))
+         hs_c = [nf]
+
+         in_ch = nf
+         for i_level in range(num_resolutions):
+             # Residual blocks for this resolution
+             for i_block in range(num_res_blocks):
+                 out_ch = nf * ch_mult[i_level]
+                 modules.append(ResnetBlock(in_ch=in_ch, out_ch=out_ch))
+                 in_ch = out_ch
+
+                 if all_resolutions[i_level] in attn_resolutions:
+                     modules.append(AttnBlock(channels=in_ch))
+                 hs_c.append(in_ch)
+
+             if i_level != num_resolutions - 1:
+                 if resblock_type == 'ddpm':
+                     modules.append(Downsample(in_ch=in_ch))
+                 else:
+                     modules.append(ResnetBlock(down=True, in_ch=in_ch))
+
+                 if progressive_input == 'input_skip':
+                     modules.append(combiner(dim1=input_pyramid_ch, dim2=in_ch))
+                     if combine_method == 'cat':
+                         in_ch *= 2
+                 elif progressive_input == 'residual':
+                     modules.append(pyramid_downsample(in_ch=input_pyramid_ch, out_ch=in_ch))
+                     input_pyramid_ch = in_ch
+
+                 hs_c.append(in_ch)
+
+         in_ch = hs_c[-1]
+         modules.append(ResnetBlock(in_ch=in_ch))
+         modules.append(AttnBlock(channels=in_ch))
+         modules.append(ResnetBlock(in_ch=in_ch))
+
+         pyramid_ch = 0
+         # Upsampling block
+         for i_level in reversed(range(num_resolutions)):
+             for i_block in range(num_res_blocks + 1):  # +1 blocks in upsampling because of skip connection from combiner (after downsampling)
+                 out_ch = nf * ch_mult[i_level]
+                 modules.append(ResnetBlock(in_ch=in_ch + hs_c.pop(), out_ch=out_ch))
+                 in_ch = out_ch
+
+             if all_resolutions[i_level] in attn_resolutions:
+                 modules.append(AttnBlock(channels=in_ch))
+
+             if progressive != 'none':
+                 if i_level == num_resolutions - 1:
+                     if progressive == 'output_skip':
+                         modules.append(nn.GroupNorm(num_groups=min(in_ch // 4, 32),
+                             num_channels=in_ch, eps=1e-6))
+                         modules.append(conv3x3(in_ch, channels, init_scale=init_scale))
+                         pyramid_ch = channels
+                     elif progressive == 'residual':
+                         modules.append(nn.GroupNorm(num_groups=min(in_ch // 4, 32), num_channels=in_ch, eps=1e-6))
+                         modules.append(conv3x3(in_ch, in_ch, bias=True))
+                         pyramid_ch = in_ch
+                     else:
+                         raise ValueError(f'{progressive} is not a valid name.')
+                 else:
+                     if progressive == 'output_skip':
+                         modules.append(nn.GroupNorm(num_groups=min(in_ch // 4, 32),
+                             num_channels=in_ch, eps=1e-6))
+                         modules.append(conv3x3(in_ch, channels, bias=True, init_scale=init_scale))
+                         pyramid_ch = channels
+                     elif progressive == 'residual':
+                         modules.append(pyramid_upsample(in_ch=pyramid_ch, out_ch=in_ch))
+                         pyramid_ch = in_ch
+                     else:
+                         raise ValueError(f'{progressive} is not a valid name.')
+
+             if i_level != 0:
+                 if resblock_type == 'ddpm':
+                     modules.append(Upsample(in_ch=in_ch))
+                 else:
+                     modules.append(ResnetBlock(in_ch=in_ch, up=True))
+
+         assert not hs_c
+
+         if progressive != 'output_skip':
+             modules.append(nn.GroupNorm(num_groups=min(in_ch // 4, 32), num_channels=in_ch, eps=1e-6))
+             modules.append(conv3x3(in_ch, channels, init_scale=init_scale))
+
+         self.all_modules = nn.ModuleList(modules)
+
+     def forward(self, x, y, t):
+         # timestep/noise_level embedding; only for continuous training
+         modules = self.all_modules
+         m_idx = 0
+
+         # Convert real and imaginary parts of (x, y) into four channel dimensions
+         x = torch.cat((x.real, x.imag, y.real, y.imag), dim=1)
+
+         if self.embedding_type == 'fourier':
+             # Gaussian Fourier features embeddings.
+             used_sigmas = t
+             temb = modules[m_idx](torch.log(used_sigmas))
+             m_idx += 1
+         elif self.embedding_type == 'positional':
+             # Sinusoidal positional embeddings.
+             timesteps = t
+             used_sigmas = self.sigmas[t.long()]
+             temb = layers.get_timestep_embedding(timesteps, self.nf)
+         else:
+             raise ValueError(f'embedding type {self.embedding_type} unknown.')
+
+         temb = modules[m_idx](temb)
+         m_idx += 1
+         temb = modules[m_idx](self.act(temb))
+         m_idx += 1
+
+         # Downsampling block
+         input_pyramid = None
+         if self.progressive_input != 'none':
+             input_pyramid = x
+
+         # Input layer: Conv2d: 4ch -> 128ch
+         hs = [modules[m_idx](x)]
+         m_idx += 1
+
+         # Down path in U-Net
+         for i_level in range(self.num_resolutions):
+             # Residual blocks for this resolution
+             for i_block in range(self.num_res_blocks):
+                 h = modules[m_idx](hs[-1], temb)
+                 m_idx += 1
+                 # Attention layer (optional)
+                 if h.shape[-2] in self.attn_resolutions:  # edit: check H dim (-2) not W dim (-1)
+                     h = modules[m_idx](h)
+                     m_idx += 1
+                 hs.append(h)
+
+             # Downsampling
+             if i_level != self.num_resolutions - 1:
+                 if self.resblock_type == 'ddpm':
+                     h = modules[m_idx](hs[-1])
+                     m_idx += 1
+                 else:
+                     h = modules[m_idx](hs[-1], temb)
+                     m_idx += 1
+
+                 if self.progressive_input == 'input_skip':  # Combine h with x
+                     input_pyramid = self.pyramid_downsample(input_pyramid)
+                     h = modules[m_idx](input_pyramid, h)
+                     m_idx += 1
+                 elif self.progressive_input == 'residual':
+                     input_pyramid = modules[m_idx](input_pyramid)
+                     m_idx += 1
+                     if self.skip_rescale:
+                         input_pyramid = (input_pyramid + h) / np.sqrt(2.)
+                     else:
+                         input_pyramid = input_pyramid + h
+                     h = input_pyramid
+                 hs.append(h)
+
+         h = hs[-1]  # actually equivalent to: h = h
+         h = modules[m_idx](h, temb)  # ResNet block
+         m_idx += 1
+         h = modules[m_idx](h)  # Attention block
+         m_idx += 1
+         h = modules[m_idx](h, temb)  # ResNet block
+         m_idx += 1
+
+         pyramid = None
+
+         # Upsampling block
+         for i_level in reversed(range(self.num_resolutions)):
+             for i_block in range(self.num_res_blocks + 1):
+                 h = modules[m_idx](torch.cat([h, hs.pop()], dim=1), temb)
+                 m_idx += 1
+
+             # edit: check H dim (-2) not W dim (-1)
+             if h.shape[-2] in self.attn_resolutions:
+                 h = modules[m_idx](h)
+                 m_idx += 1
+
+             if self.progressive != 'none':
+                 if i_level == self.num_resolutions - 1:
+                     if self.progressive == 'output_skip':
+                         pyramid = self.act(modules[m_idx](h))  # GroupNorm
+                         m_idx += 1
+                         pyramid = modules[m_idx](pyramid)  # Conv2d: 256ch -> 4ch
+                         m_idx += 1
+                     elif self.progressive == 'residual':
+                         pyramid = self.act(modules[m_idx](h))
+                         m_idx += 1
+                         pyramid = modules[m_idx](pyramid)
+                         m_idx += 1
+                     else:
+                         raise ValueError(f'{self.progressive} is not a valid name.')
+                 else:
+                     if self.progressive == 'output_skip':
+                         pyramid = self.pyramid_upsample(pyramid)  # Upsample
+                         pyramid_h = self.act(modules[m_idx](h))  # GroupNorm
+                         m_idx += 1
+                         pyramid_h = modules[m_idx](pyramid_h)
+                         m_idx += 1
+                         pyramid = pyramid + pyramid_h
+                     elif self.progressive == 'residual':
+                         pyramid = modules[m_idx](pyramid)
+                         m_idx += 1
+                         if self.skip_rescale:
+                             pyramid = (pyramid + h) / np.sqrt(2.)
+                         else:
+                             pyramid = pyramid + h
+                         h = pyramid
+                     else:
+                         raise ValueError(f'{self.progressive} is not a valid name.')
+
+             # Upsampling layer
+             if i_level != 0:
+                 if self.resblock_type == 'ddpm':
+                     h = modules[m_idx](h)
+                     m_idx += 1
+                 else:
+                     h = modules[m_idx](h, temb)  # Upsampling
+                     m_idx += 1
+
+         assert not hs
+
+         if self.progressive == 'output_skip':
+             h = pyramid
+         else:
+             h = self.act(modules[m_idx](h))
+             m_idx += 1
+             h = modules[m_idx](h)
+             m_idx += 1
+
+         assert m_idx == len(modules), "Implementation error"
+
+         h = self.output_layer(h)
+         h = torch.permute(h, (0, 2, 3, 1)).contiguous()
+
+         # Convert back to complex number
+         h = torch.view_as_complex(h)[:, None, :, :]
+
+         return h
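A minimal sketch of how this backbone is driven (illustrative only, not part of the commit; it assumes the `sgmse` package is importable and uses the defaults above). forward() packs two complex (B, 1, F, T) spectrograms into four real channels and returns a complex score of the same shape:

import torch
from sgmse.backbones import BackboneRegistry

model = BackboneRegistry.get_by_name("ncsnpp_v2")()  # defaults as registered above

B, F_bins, T_frames = 2, 256, 256  # spatial dims must survive the 6 downsamplings
x_t = torch.randn(B, 1, F_bins, T_frames, dtype=torch.cfloat)  # current diffusion state
y = torch.randn(B, 1, F_bins, T_frames, dtype=torch.cfloat)    # noisy conditioner
t = torch.rand(B) * 0.97 + 0.03                                # diffusion times > 0 (log is taken)

score = model(x_t, y, t)
print(score.shape, score.dtype)  # torch.Size([2, 1, 256, 256]) torch.complex64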
sgmse/backbones/shared.py CHANGED
@@ -1,123 +1,123 @@
 
+ import functools
+ import numpy as np
+
+ import torch
+ import torch.nn as nn
+
+ from sgmse.util.registry import Registry
+
+
+ BackboneRegistry = Registry("Backbone")
+
+
+ class GaussianFourierProjection(nn.Module):
+     """Gaussian random features for encoding time steps."""
+
+     def __init__(self, embed_dim, scale=16, complex_valued=False):
+         super().__init__()
+         self.complex_valued = complex_valued
+         if not complex_valued:
+             # If the output is real-valued, we concatenate sin+cos of the features to avoid ambiguities.
+             # Therefore, in this case the effective embed_dim is cut in half. For the complex-valued case,
+             # we use complex numbers which each represent sin+cos directly, so the ambiguity is avoided directly,
+             # and this halving is not necessary.
+             embed_dim = embed_dim // 2
+         # Randomly sample weights during initialization. These weights are fixed
+         # during optimization and are not trainable.
+         self.W = nn.Parameter(torch.randn(embed_dim) * scale, requires_grad=False)
+
+     def forward(self, t):
+         t_proj = t[:, None] * self.W[None, :] * 2 * np.pi
+         if self.complex_valued:
+             return torch.exp(1j * t_proj)
+         else:
+             return torch.cat([torch.sin(t_proj), torch.cos(t_proj)], dim=-1)
+
+
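A quick shape check for the halving logic above (illustrative sketch, not part of the commit): the real-valued variant internally halves embed_dim and concatenates sin and cos, so both variants return the requested width.

import torch

t = torch.rand(4)  # batch of 4 time steps
emb_real = GaussianFourierProjection(embed_dim=128)
emb_cplx = GaussianFourierProjection(embed_dim=128, complex_valued=True)
print(emb_real(t).shape)                     # torch.Size([4, 128]), real-valued
print(emb_cplx(t).shape, emb_cplx(t).dtype)  # torch.Size([4, 128]) torch.complex64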
+ class DiffusionStepEmbedding(nn.Module):
+     """Diffusion-step embedding as in DiffWave / Vaswani et al. 2017."""
+
+     def __init__(self, embed_dim, complex_valued=False):
+         super().__init__()
+         self.complex_valued = complex_valued
+         if not complex_valued:
+             # If the output is real-valued, we concatenate sin+cos of the features to avoid ambiguities.
+             # Therefore, in this case the effective embed_dim is cut in half. For the complex-valued case,
+             # we use complex numbers which each represent sin+cos directly, so the ambiguity is avoided directly,
+             # and this halving is not necessary.
+             embed_dim = embed_dim // 2
+         self.embed_dim = embed_dim
+
+     def forward(self, t):
+         fac = 10**(4*torch.arange(self.embed_dim, device=t.device) / (self.embed_dim-1))
+         inner = t[:, None] * fac[None, :]
+         if self.complex_valued:
+             return torch.exp(1j * inner)
+         else:
+             return torch.cat([torch.sin(inner), torch.cos(inner)], dim=-1)
+
+
+ class ComplexLinear(nn.Module):
+     """A potentially complex-valued linear layer. Reduces to a regular linear layer if `complex_valued=False`."""
+
+     def __init__(self, input_dim, output_dim, complex_valued):
+         super().__init__()
+         self.complex_valued = complex_valued
+         if self.complex_valued:
+             self.re = nn.Linear(input_dim, output_dim)
+             self.im = nn.Linear(input_dim, output_dim)
+         else:
+             self.lin = nn.Linear(input_dim, output_dim)
+
+     def forward(self, x):
+         if self.complex_valued:
+             return (self.re(x.real) - self.im(x.imag)) + 1j*(self.re(x.imag) + self.im(x.real))
+         else:
+             return self.lin(x)
+
+
+ class FeatureMapDense(nn.Module):
+     """A fully connected layer that reshapes outputs to feature maps."""
+
+     def __init__(self, input_dim, output_dim, complex_valued=False):
+         super().__init__()
+         self.complex_valued = complex_valued
+         self.dense = ComplexLinear(input_dim, output_dim, complex_valued=complex_valued)
+
+     def forward(self, x):
+         return self.dense(x)[..., None, None]
+
+
+ def torch_complex_from_reim(re, im):
+     return torch.view_as_complex(torch.stack([re, im], dim=-1))
+
+
+ class ArgsComplexMultiplicationWrapper(nn.Module):
+     """Adapted from `asteroid`'s `complex_nn.py`, allowing args/kwargs to be passed through forward().
+
+     Make a complex-valued module `F` from a real-valued module `f` by applying
+     complex multiplication rules:
+
+     F(a + i b) = f1(a) - f2(b) + i (f1(b) + f2(a))
+
+     where `f1`, `f2` are instances of `f` that do *not* share weights.
+
+     Args:
+         module_cls (callable): A class or function that returns a Torch module/functional.
+             Constructor of `f` in the formula above. Called 2x with `*args`, `**kwargs`,
+             to construct the real and imaginary component modules.
+     """
+
+     def __init__(self, module_cls, *args, **kwargs):
+         super().__init__()
+         self.re_module = module_cls(*args, **kwargs)
+         self.im_module = module_cls(*args, **kwargs)
+
+     def forward(self, x, *args, **kwargs):
+         return torch_complex_from_reim(
+             self.re_module(x.real, *args, **kwargs) - self.im_module(x.imag, *args, **kwargs),
+             self.re_module(x.imag, *args, **kwargs) + self.im_module(x.real, *args, **kwargs),
+         )
+
+
+ ComplexConv2d = functools.partial(ArgsComplexMultiplicationWrapper, nn.Conv2d)
+ ComplexConvTranspose2d = functools.partial(ArgsComplexMultiplicationWrapper, nn.ConvTranspose2d)
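To make the docstring formula concrete (illustrative sketch, not part of the commit): ComplexConv2d realizes F(a + ib) = f1(a) - f2(b) + i (f1(b) + f2(a)) with f1 = re_module and f2 = im_module, two independent real convolutions.

import torch

conv = ComplexConv2d(1, 8, kernel_size=3, padding=1)
x = torch.randn(2, 1, 32, 32, dtype=torch.cfloat)
out = conv(x)
ref = torch_complex_from_reim(
    conv.re_module(x.real) - conv.im_module(x.imag),
    conv.re_module(x.imag) + conv.im_module(x.real),
)
assert torch.allclose(out, ref)
print(out.shape, out.dtype)  # torch.Size([2, 8, 32, 32]) torch.complex64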
sgmse/data_module.py CHANGED
@@ -1,236 +1,236 @@
-
- from os.path import join
- import torch
- import pytorch_lightning as pl
- from torch.utils.data import Dataset
- from torch.utils.data import DataLoader
- from glob import glob
- from torchaudio import load
- import numpy as np
- import torch.nn.functional as F
-
-
- def get_window(window_type, window_length):
-     if window_type == 'sqrthann':
-         return torch.sqrt(torch.hann_window(window_length, periodic=True))
-     elif window_type == 'hann':
-         return torch.hann_window(window_length, periodic=True)
-     else:
-         raise NotImplementedError(f"Window type {window_type} not implemented!")
-
-
- class Specs(Dataset):
-     def __init__(self, data_dir, subset, dummy, shuffle_spec, num_frames,
-             format='default', normalize="noisy", spec_transform=None,
-             stft_kwargs=None, **ignored_kwargs):
-
-         # Read file paths according to file naming format.
-         if format == "default":
-             self.clean_files = []
-             self.clean_files += sorted(glob(join(data_dir, subset, "clean", "*.wav")))
-             self.clean_files += sorted(glob(join(data_dir, subset, "clean", "**", "*.wav")))
-             self.noisy_files = []
-             self.noisy_files += sorted(glob(join(data_dir, subset, "noisy", "*.wav")))
-             self.noisy_files += sorted(glob(join(data_dir, subset, "noisy", "**", "*.wav")))
-         elif format == "reverb":
-             self.clean_files = []
-             self.clean_files += sorted(glob(join(data_dir, subset, "anechoic", "*.wav")))
-             self.clean_files += sorted(glob(join(data_dir, subset, "anechoic", "**", "*.wav")))
-             self.noisy_files = []
-             self.noisy_files += sorted(glob(join(data_dir, subset, "reverb", "*.wav")))
-             self.noisy_files += sorted(glob(join(data_dir, subset, "reverb", "**", "*.wav")))
-         else:
-             # Feel free to add your own directory format
-             raise NotImplementedError(f"Directory format {format} unknown!")
-
-         self.dummy = dummy
-         self.num_frames = num_frames
-         self.shuffle_spec = shuffle_spec
-         self.normalize = normalize
-         self.spec_transform = spec_transform
-
-         assert all(k in stft_kwargs.keys() for k in ["n_fft", "hop_length", "center", "window"]), "misconfigured STFT kwargs"
-         self.stft_kwargs = stft_kwargs
-         self.hop_length = self.stft_kwargs["hop_length"]
-         assert self.stft_kwargs.get("center", None) == True, "'center' must be True for current implementation"
-
-     def __getitem__(self, i):
-         x, _ = load(self.clean_files[i])
-         y, _ = load(self.noisy_files[i])
-
-         # formula applies for center=True
-         target_len = (self.num_frames - 1) * self.hop_length
-         current_len = x.size(-1)
-         pad = max(target_len - current_len, 0)
-         if pad == 0:
-             # extract a random part of the audio file
-             if self.shuffle_spec:
-                 start = int(np.random.uniform(0, current_len-target_len))
-             else:
-                 start = int((current_len-target_len)/2)
-             x = x[..., start:start+target_len]
-             y = y[..., start:start+target_len]
-         else:
-             # pad the audio if its length is smaller than num_frames requires
-             x = F.pad(x, (pad//2, pad//2+(pad%2)), mode='constant')
-             y = F.pad(y, (pad//2, pad//2+(pad%2)), mode='constant')
-
-         # normalize w.r.t. the noisy signal, the clean signal, or not at all,
-         # to ensure the same clean signal power in x and y.
-         if self.normalize == "noisy":
-             normfac = y.abs().max()
-         elif self.normalize == "clean":
-             normfac = x.abs().max()
-         elif self.normalize == "not":
-             normfac = 1.0
-         x = x / normfac
-         y = y / normfac
-
-         X = torch.stft(x, **self.stft_kwargs)
-         Y = torch.stft(y, **self.stft_kwargs)
-
-         X, Y = self.spec_transform(X), self.spec_transform(Y)
-         return X, Y
-
-     def __len__(self):
-         if self.dummy:
-             # for debugging, shrink the dataset size
-             return int(len(self.clean_files)/200)
-         else:
-             return len(self.clean_files)
-
-
- class SpecsDataModule(pl.LightningDataModule):
-     @staticmethod
-     def add_argparse_args(parser):
-         parser.add_argument("--base_dir", type=str, required=True, help="The base directory of the dataset. Should contain `train`, `valid` and `test` subdirectories, each of which contain `clean` and `noisy` subdirectories.")
-         parser.add_argument("--format", type=str, choices=("default", "reverb"), default="default", help="Read file paths according to file naming format.")
-         parser.add_argument("--batch_size", type=int, default=8, help="The batch size. 8 by default.")
-         parser.add_argument("--n_fft", type=int, default=510, help="Number of FFT bins. 510 by default.")  # to ensure 256 frequency bins
-         parser.add_argument("--hop_length", type=int, default=128, help="Window hop length. 128 by default.")
-         parser.add_argument("--num_frames", type=int, default=256, help="Number of frames for the dataset. 256 by default.")
-         parser.add_argument("--window", type=str, choices=("sqrthann", "hann"), default="hann", help="The window function to use for the STFT. 'hann' by default.")
-         parser.add_argument("--num_workers", type=int, default=16, help="Number of workers to use for DataLoaders. 4 by default.")
-         parser.add_argument("--dummy", action="store_true", help="Use reduced dummy dataset for prototyping.")
-         parser.add_argument("--spec_factor", type=float, default=0.15, help="Factor to multiply complex STFT coefficients by. 0.15 by default.")
-         parser.add_argument("--spec_abs_exponent", type=float, default=0.5, help="Exponent e for the transformation abs(z)**e * exp(1j*angle(z)). 0.5 by default.")
-         parser.add_argument("--normalize", type=str, choices=("clean", "noisy", "not"), default="noisy", help="Normalize the input waveforms by the clean signal, the noisy signal, or not at all.")
-         parser.add_argument("--transform_type", type=str, choices=("exponent", "log", "none"), default="exponent", help="Spectrogram transformation for input representation.")
-         return parser
-
-     def __init__(
-             self, base_dir, format='default', batch_size=8,
-             n_fft=510, hop_length=128, num_frames=256, window='hann',
-             num_workers=4, dummy=False, spec_factor=0.15, spec_abs_exponent=0.5,
-             gpu=True, normalize='noisy', transform_type="exponent", **kwargs
-     ):
-         super().__init__()
-         self.base_dir = base_dir
-         self.format = format
-         self.batch_size = batch_size
-         self.n_fft = n_fft
-         self.hop_length = hop_length
-         self.num_frames = num_frames
-         self.window = get_window(window, self.n_fft)
-         self.windows = {}
-         self.num_workers = num_workers
-         self.dummy = dummy
-         self.spec_factor = spec_factor
-         self.spec_abs_exponent = spec_abs_exponent
-         self.gpu = gpu
-         self.normalize = normalize
-         self.transform_type = transform_type
-         self.kwargs = kwargs
-
-     def setup(self, stage=None):
-         specs_kwargs = dict(
-             stft_kwargs=self.stft_kwargs, num_frames=self.num_frames,
-             spec_transform=self.spec_fwd, **self.kwargs
-         )
-         if stage == 'fit' or stage is None:
-             self.train_set = Specs(data_dir=self.base_dir, subset='train',
-                 dummy=self.dummy, shuffle_spec=True, format=self.format,
-                 normalize=self.normalize, **specs_kwargs)
-             self.valid_set = Specs(data_dir=self.base_dir, subset='valid',
-                 dummy=self.dummy, shuffle_spec=False, format=self.format,
-                 normalize=self.normalize, **specs_kwargs)
-         if stage == 'test' or stage is None:
-             self.test_set = Specs(data_dir=self.base_dir, subset='test',
-                 dummy=self.dummy, shuffle_spec=False, format=self.format,
-                 normalize=self.normalize, **specs_kwargs)
-
-     def spec_fwd(self, spec):
-         if self.transform_type == "exponent":
-             if self.spec_abs_exponent != 1:
-                 # only do this calculation if spec_abs_exponent != 1, otherwise it's quite a bit of
-                 # wasted computation and introduces numerical error
-                 e = self.spec_abs_exponent
-                 spec = spec.abs()**e * torch.exp(1j * spec.angle())
-             spec = spec * self.spec_factor
-         elif self.transform_type == "log":
-             spec = torch.log(1 + spec.abs()) * torch.exp(1j * spec.angle())
-             spec = spec * self.spec_factor
-         elif self.transform_type == "none":
-             spec = spec
-         return spec
-
-     def spec_back(self, spec):
-         if self.transform_type == "exponent":
-             spec = spec / self.spec_factor
-             if self.spec_abs_exponent != 1:
-                 e = self.spec_abs_exponent
-                 spec = spec.abs()**(1/e) * torch.exp(1j * spec.angle())
-         elif self.transform_type == "log":
-             spec = spec / self.spec_factor
-             spec = (torch.exp(spec.abs()) - 1) * torch.exp(1j * spec.angle())
-         elif self.transform_type == "none":
-             spec = spec
-         return spec
-
-     @property
-     def stft_kwargs(self):
-         return {**self.istft_kwargs, "return_complex": True}
-
-     @property
-     def istft_kwargs(self):
-         return dict(
-             n_fft=self.n_fft, hop_length=self.hop_length,
-             window=self.window, center=True
-         )
-
-     def _get_window(self, x):
-         """
-         Retrieve an appropriate window for the given tensor x, matching the device.
-         Caches the retrieved windows so that only one window tensor will be allocated per device.
-         """
-         window = self.windows.get(x.device, None)
-         if window is None:
-             window = self.window.to(x.device)
-             self.windows[x.device] = window
-         return window
-
-     def stft(self, sig):
-         window = self._get_window(sig)
-         return torch.stft(sig, **{**self.stft_kwargs, "window": window})
-
-     def istft(self, spec, length=None):
-         window = self._get_window(spec)
-         return torch.istft(spec, **{**self.istft_kwargs, "window": window, "length": length})
-
-     def train_dataloader(self):
-         return DataLoader(
-             self.train_set, batch_size=self.batch_size,
-             num_workers=self.num_workers, pin_memory=self.gpu, shuffle=True
-         )
-
-     def val_dataloader(self):
-         return DataLoader(
-             self.valid_set, batch_size=self.batch_size,
-             num_workers=self.num_workers, pin_memory=self.gpu, shuffle=False
-         )
-
-     def test_dataloader(self):
-         return DataLoader(
-             self.test_set, batch_size=self.batch_size,
-             num_workers=self.num_workers, pin_memory=self.gpu, shuffle=False
-         )
 
+
+ from os.path import join
+ import torch
+ import pytorch_lightning as pl
+ from torch.utils.data import Dataset
+ from torch.utils.data import DataLoader
+ from glob import glob
+ from torchaudio import load
+ import numpy as np
+ import torch.nn.functional as F
+
+
+ def get_window(window_type, window_length):
+     if window_type == 'sqrthann':
+         return torch.sqrt(torch.hann_window(window_length, periodic=True))
+     elif window_type == 'hann':
+         return torch.hann_window(window_length, periodic=True)
+     else:
+         raise NotImplementedError(f"Window type {window_type} not implemented!")
+
+
+ class Specs(Dataset):
+     def __init__(self, data_dir, subset, dummy, shuffle_spec, num_frames,
+             format='default', normalize="noisy", spec_transform=None,
+             stft_kwargs=None, **ignored_kwargs):
+
+         # Read file paths according to file naming format.
+         if format == "default":
+             self.clean_files = []
+             self.clean_files += sorted(glob(join(data_dir, subset, "clean", "*.wav")))
+             self.clean_files += sorted(glob(join(data_dir, subset, "clean", "**", "*.wav")))
+             self.noisy_files = []
+             self.noisy_files += sorted(glob(join(data_dir, subset, "noisy", "*.wav")))
+             self.noisy_files += sorted(glob(join(data_dir, subset, "noisy", "**", "*.wav")))
+         elif format == "reverb":
+             self.clean_files = []
+             self.clean_files += sorted(glob(join(data_dir, subset, "anechoic", "*.wav")))
+             self.clean_files += sorted(glob(join(data_dir, subset, "anechoic", "**", "*.wav")))
+             self.noisy_files = []
+             self.noisy_files += sorted(glob(join(data_dir, subset, "reverb", "*.wav")))
+             self.noisy_files += sorted(glob(join(data_dir, subset, "reverb", "**", "*.wav")))
+         else:
+             # Feel free to add your own directory format
+             raise NotImplementedError(f"Directory format {format} unknown!")
+
+         self.dummy = dummy
+         self.num_frames = num_frames
+         self.shuffle_spec = shuffle_spec
+         self.normalize = normalize
+         self.spec_transform = spec_transform
+
+         assert all(k in stft_kwargs.keys() for k in ["n_fft", "hop_length", "center", "window"]), "misconfigured STFT kwargs"
+         self.stft_kwargs = stft_kwargs
+         self.hop_length = self.stft_kwargs["hop_length"]
+         assert self.stft_kwargs.get("center", None) == True, "'center' must be True for current implementation"
+
+     def __getitem__(self, i):
+         x, _ = load(self.clean_files[i])
+         y, _ = load(self.noisy_files[i])
+
+         # formula applies for center=True
+         target_len = (self.num_frames - 1) * self.hop_length
+         current_len = x.size(-1)
+         pad = max(target_len - current_len, 0)
+         if pad == 0:
+             # extract a random part of the audio file
+             if self.shuffle_spec:
+                 start = int(np.random.uniform(0, current_len-target_len))
+             else:
+                 start = int((current_len-target_len)/2)
+             x = x[..., start:start+target_len]
+             y = y[..., start:start+target_len]
+         else:
+             # pad the audio if its length is smaller than num_frames requires
+             x = F.pad(x, (pad//2, pad//2+(pad%2)), mode='constant')
+             y = F.pad(y, (pad//2, pad//2+(pad%2)), mode='constant')
+
+         # normalize w.r.t. the noisy signal, the clean signal, or not at all,
+         # to ensure the same clean signal power in x and y.
+         if self.normalize == "noisy":
+             normfac = y.abs().max()
+         elif self.normalize == "clean":
+             normfac = x.abs().max()
+         elif self.normalize == "not":
+             normfac = 1.0
+         x = x / normfac
+         y = y / normfac
+
+         X = torch.stft(x, **self.stft_kwargs)
+         Y = torch.stft(y, **self.stft_kwargs)
+
+         X, Y = self.spec_transform(X), self.spec_transform(Y)
+         return X, Y
+
+     def __len__(self):
+         if self.dummy:
+             # for debugging, shrink the dataset size
+             return int(len(self.clean_files)/200)
+         else:
+             return len(self.clean_files)
+
+
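As a concrete check of the cropping formula above (illustrative sketch, not part of the commit): with the defaults num_frames=256 and hop_length=128, target_len = (256 - 1) * 128 = 32640 samples (about 2.04 s at 16 kHz), which a centered STFT turns into exactly 256 frames.

import torch

num_frames, hop_length, n_fft = 256, 128, 510
target_len = (num_frames - 1) * hop_length  # 32640 samples
sig = torch.randn(target_len)
spec = torch.stft(sig, n_fft=n_fft, hop_length=hop_length, center=True,
                  window=torch.hann_window(n_fft), return_complex=True)
print(spec.shape)  # torch.Size([256, 256]) -> 256 frequency bins, 256 frames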
+ class SpecsDataModule(pl.LightningDataModule):
+     @staticmethod
+     def add_argparse_args(parser):
+         parser.add_argument("--base_dir", type=str, required=True, help="The base directory of the dataset. Should contain `train`, `valid` and `test` subdirectories, each of which contain `clean` and `noisy` subdirectories.")
+         parser.add_argument("--format", type=str, choices=("default", "reverb"), default="default", help="Read file paths according to file naming format.")
+         parser.add_argument("--batch_size", type=int, default=8, help="The batch size. 8 by default.")
+         parser.add_argument("--n_fft", type=int, default=510, help="Number of FFT bins. 510 by default.")  # to ensure 256 frequency bins
+         parser.add_argument("--hop_length", type=int, default=128, help="Window hop length. 128 by default.")
+         parser.add_argument("--num_frames", type=int, default=256, help="Number of frames for the dataset. 256 by default.")
+         parser.add_argument("--window", type=str, choices=("sqrthann", "hann"), default="hann", help="The window function to use for the STFT. 'hann' by default.")
+         parser.add_argument("--num_workers", type=int, default=4, help="Number of workers to use for DataLoaders. 4 by default.")
+         parser.add_argument("--dummy", action="store_true", help="Use reduced dummy dataset for prototyping.")
+         parser.add_argument("--spec_factor", type=float, default=0.15, help="Factor to multiply complex STFT coefficients by. 0.15 by default.")
+         parser.add_argument("--spec_abs_exponent", type=float, default=0.5, help="Exponent e for the transformation abs(z)**e * exp(1j*angle(z)). 0.5 by default.")
+         parser.add_argument("--normalize", type=str, choices=("clean", "noisy", "not"), default="noisy", help="Normalize the input waveforms by the clean signal, the noisy signal, or not at all.")
+         parser.add_argument("--transform_type", type=str, choices=("exponent", "log", "none"), default="exponent", help="Spectrogram transformation for input representation.")
+         return parser
+
+     def __init__(
+             self, base_dir, format='default', batch_size=8,
+             n_fft=510, hop_length=128, num_frames=256, window='hann',
+             num_workers=4, dummy=False, spec_factor=0.15, spec_abs_exponent=0.5,
+             gpu=True, normalize='noisy', transform_type="exponent", **kwargs
+     ):
+         super().__init__()
+         self.base_dir = base_dir
+         self.format = format
+         self.batch_size = batch_size
+         self.n_fft = n_fft
+         self.hop_length = hop_length
+         self.num_frames = num_frames
+         self.window = get_window(window, self.n_fft)
+         self.windows = {}
+         self.num_workers = num_workers
+         self.dummy = dummy
+         self.spec_factor = spec_factor
+         self.spec_abs_exponent = spec_abs_exponent
+         self.gpu = gpu
+         self.normalize = normalize
+         self.transform_type = transform_type
+         self.kwargs = kwargs
+
+     def setup(self, stage=None):
+         specs_kwargs = dict(
+             stft_kwargs=self.stft_kwargs, num_frames=self.num_frames,
+             spec_transform=self.spec_fwd, **self.kwargs
+         )
+         if stage == 'fit' or stage is None:
+             self.train_set = Specs(data_dir=self.base_dir, subset='train',
+                 dummy=self.dummy, shuffle_spec=True, format=self.format,
+                 normalize=self.normalize, **specs_kwargs)
+             self.valid_set = Specs(data_dir=self.base_dir, subset='valid',
+                 dummy=self.dummy, shuffle_spec=False, format=self.format,
+                 normalize=self.normalize, **specs_kwargs)
+         if stage == 'test' or stage is None:
+             self.test_set = Specs(data_dir=self.base_dir, subset='test',
+                 dummy=self.dummy, shuffle_spec=False, format=self.format,
+                 normalize=self.normalize, **specs_kwargs)
+
+     def spec_fwd(self, spec):
+         if self.transform_type == "exponent":
+             if self.spec_abs_exponent != 1:
+                 # only do this calculation if spec_abs_exponent != 1, otherwise it's quite a bit of
+                 # wasted computation and introduces numerical error
+                 e = self.spec_abs_exponent
+                 spec = spec.abs()**e * torch.exp(1j * spec.angle())
+             spec = spec * self.spec_factor
+         elif self.transform_type == "log":
+             spec = torch.log(1 + spec.abs()) * torch.exp(1j * spec.angle())
+             spec = spec * self.spec_factor
+         elif self.transform_type == "none":
+             spec = spec
+         return spec
+
+     def spec_back(self, spec):
+         if self.transform_type == "exponent":
+             spec = spec / self.spec_factor
+             if self.spec_abs_exponent != 1:
+                 e = self.spec_abs_exponent
+                 spec = spec.abs()**(1/e) * torch.exp(1j * spec.angle())
+         elif self.transform_type == "log":
+             spec = spec / self.spec_factor
+             spec = (torch.exp(spec.abs()) - 1) * torch.exp(1j * spec.angle())
+         elif self.transform_type == "none":
+             spec = spec
+         return spec
+
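spec_back inverts spec_fwd exactly; a small round-trip check for the default "exponent" transform abs(z)**e * exp(1j*angle(z)) scaled by spec_factor (illustrative sketch, not part of the commit):

import torch

e, factor = 0.5, 0.15
z = torch.randn(8, dtype=torch.cfloat)
fwd = (z.abs()**e * torch.exp(1j * z.angle())) * factor       # spec_fwd
back = fwd / factor                                           # spec_back ...
back = back.abs()**(1/e) * torch.exp(1j * back.angle())
assert torch.allclose(back, z, atol=1e-5)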
+     @property
+     def stft_kwargs(self):
+         return {**self.istft_kwargs, "return_complex": True}
+
+     @property
+     def istft_kwargs(self):
+         return dict(
+             n_fft=self.n_fft, hop_length=self.hop_length,
+             window=self.window, center=True
+         )
+
+     def _get_window(self, x):
+         """
+         Retrieve an appropriate window for the given tensor x, matching the device.
+         Caches the retrieved windows so that only one window tensor will be allocated per device.
+         """
+         window = self.windows.get(x.device, None)
+         if window is None:
+             window = self.window.to(x.device)
+             self.windows[x.device] = window
+         return window
+
+     def stft(self, sig):
+         window = self._get_window(sig)
+         return torch.stft(sig, **{**self.stft_kwargs, "window": window})
+
+     def istft(self, spec, length=None):
+         window = self._get_window(spec)
+         return torch.istft(spec, **{**self.istft_kwargs, "window": window, "length": length})
+
+     def train_dataloader(self):
+         return DataLoader(
+             self.train_set, batch_size=self.batch_size,
+             num_workers=self.num_workers, pin_memory=self.gpu, shuffle=True
+         )
+
+     def val_dataloader(self):
+         return DataLoader(
+             self.valid_set, batch_size=self.batch_size,
+             num_workers=self.num_workers, pin_memory=self.gpu, shuffle=False
+         )
+
+     def test_dataloader(self):
+         return DataLoader(
+             self.test_set, batch_size=self.batch_size,
+             num_workers=self.num_workers, pin_memory=self.gpu, shuffle=False
+         )
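Typical standalone use of the data module (illustrative sketch, not part of the commit; "data" is a placeholder path laid out as the --base_dir help text above describes):

dm = SpecsDataModule(base_dir="data", batch_size=4, num_workers=0)
dm.setup(stage="fit")
X, Y = next(iter(dm.train_dataloader()))  # transformed complex spectrograms
print(X.shape, X.dtype)  # e.g. torch.Size([4, 1, 256, 256]) torch.complex64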
sgmse/model.py CHANGED
@@ -1,468 +1,471 @@
- import time
- from math import ceil
- import warnings
-
- import torch
- import pytorch_lightning as pl
- import torch.distributed as dist
- from torchaudio import load
- from torch_ema import ExponentialMovingAverage
- from librosa import resample
-
- from sgmse import sampling
- from sgmse.sdes import SDERegistry
- from sgmse.backbones import BackboneRegistry
- from sgmse.util.inference import evaluate_model
- from sgmse.util.other import pad_spec, si_sdr
- from pesq import pesq
- from pystoi import stoi
- from torch_pesq import PesqLoss
-
-
- class ScoreModel(pl.LightningModule):
-     @staticmethod
-     def add_argparse_args(parser):
-         parser.add_argument("--lr", type=float, default=1e-4, help="The learning rate (1e-4 by default)")
-         parser.add_argument("--ema_decay", type=float, default=0.999, help="The parameter EMA decay constant (0.999 by default)")
-         parser.add_argument("--t_eps", type=float, default=0.03, help="The minimum process time (0.03 by default)")
-         parser.add_argument("--num_eval_files", type=int, default=50, help="Number of files for speech enhancement performance evaluation during training. Pass 0 to turn off (no checkpoints based on evaluation metrics will be generated).")
-         parser.add_argument("--loss_type", type=str, default="score_matching", help="The type of loss function to use.")
-         parser.add_argument("--loss_weighting", type=str, default="sigma^2", help="The weighting of the loss function.")
-         parser.add_argument("--network_scaling", type=str, default=None, help="The type of network output scaling to use.")
-         parser.add_argument("--c_in", type=str, default="1", help="The input scaling for x.")
-         parser.add_argument("--c_out", type=str, default="1", help="The output scaling.")
-         parser.add_argument("--c_skip", type=str, default="0", help="The skip connection scaling.")
-         parser.add_argument("--sigma_data", type=float, default=0.1, help="The data standard deviation.")
-         parser.add_argument("--l1_weight", type=float, default=0.001, help="The balance between the time-frequency and time-domain losses.")
-         parser.add_argument("--pesq_weight", type=float, default=0.0, help="The weight of the PESQ-based loss term.")
-         parser.add_argument("--sr", type=int, default=16000, help="The sample rate of the audio files.")
-         return parser
-
-     def __init__(
-         self, backbone, sde, lr=1e-4, ema_decay=0.999, t_eps=0.03, num_eval_files=20, loss_type='score_matching',
-         loss_weighting='sigma^2', network_scaling=None, c_in='1', c_out='1', c_skip='0', sigma_data=0.1,
-         l1_weight=0.001, pesq_weight=0.0, sr=16000, data_module_cls=None, **kwargs
-     ):
-         """
-         Create a new ScoreModel.
-
-         Args:
-             backbone: Backbone DNN that serves as a score-based model.
-             sde: The SDE that defines the diffusion process.
-             lr: The learning rate of the optimizer (1e-4 by default).
-             ema_decay: The decay constant of the parameter EMA (0.999 by default).
-             t_eps: The minimum time to practically run for, to avoid issues very close to zero (0.03 by default).
-             loss_type: The type of loss to use. Options are 'score_matching' (default), 'denoiser' and 'data_prediction'.
-         """
-         super().__init__()
-         # Initialize Backbone DNN
-         self.backbone = backbone
-         dnn_cls = BackboneRegistry.get_by_name(backbone)
-         self.dnn = dnn_cls(**kwargs)
-         # Initialize SDE
-         sde_cls = SDERegistry.get_by_name(sde)
-         self.sde = sde_cls(**kwargs)
-         # Store hyperparams and save them
-         self.lr = lr
-         self.ema_decay = ema_decay
-         self.ema = ExponentialMovingAverage(self.parameters(), decay=self.ema_decay)
-         self._error_loading_ema = False
-         self.t_eps = t_eps
-         self.loss_type = loss_type
-         self.loss_weighting = loss_weighting
-         self.l1_weight = l1_weight
-         self.pesq_weight = pesq_weight
-         self.network_scaling = network_scaling
-         self.c_in = c_in
-         self.c_out = c_out
-         self.c_skip = c_skip
-         self.sigma_data = sigma_data
-         self.num_eval_files = num_eval_files
-         self.sr = sr
-         # Initialize PESQ loss if pesq_weight > 0.0
-         if pesq_weight > 0.0:
-             self.pesq_loss = PesqLoss(1.0, sample_rate=sr).eval()
-             for param in self.pesq_loss.parameters():
-                 param.requires_grad = False
-         self.save_hyperparameters(ignore=['no_wandb'])
-         self.data_module = data_module_cls(**kwargs, gpu=kwargs.get('gpus', 0) > 0)
-
-     def configure_optimizers(self):
-         optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
-         return optimizer
-
-     def optimizer_step(self, *args, **kwargs):
-         # Method overridden so that the EMA params are updated after each optimizer step
-         super().optimizer_step(*args, **kwargs)
-         self.ema.update(self.dnn.parameters())
-
-     # on_load_checkpoint / on_save_checkpoint needed for EMA storing/loading
-     def on_load_checkpoint(self, checkpoint):
-         ema = checkpoint.get('ema', None)
-         if ema is not None:
-             self.ema.load_state_dict(checkpoint['ema'])
-         else:
-             self._error_loading_ema = True
-             warnings.warn("EMA state_dict not found in checkpoint!")
-
-     def on_save_checkpoint(self, checkpoint):
-         checkpoint['ema'] = self.ema.state_dict()
-
-     def train(self, mode, no_ema=False):
-         res = super().train(mode)  # call the standard `train` method with the given mode
-         if not self._error_loading_ema:
-             if mode == False and not no_ema:
-                 # eval
-                 self.ema.store(self.dnn.parameters())    # store current params in EMA
-                 self.ema.copy_to(self.dnn.parameters())  # copy EMA parameters over current params for evaluation
-             else:
-                 # train
-                 if self.ema.collected_params is not None:
-                     self.ema.restore(self.dnn.parameters())  # restore the EMA weights (if stored)
-         return res
-
-     def eval(self, no_ema=False):
-         return self.train(False, no_ema=no_ema)
-
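The train()/eval() overrides wrap the standard torch_ema swap pattern; a minimal standalone sketch of that pattern on a toy module (illustrative only, not part of the commit):

import torch.nn as nn
from torch_ema import ExponentialMovingAverage

net = nn.Linear(4, 4)
ema = ExponentialMovingAverage(net.parameters(), decay=0.999)
ema.update(net.parameters())   # after each optimizer step during training
ema.store(net.parameters())    # evaluation: stash the live weights ...
ema.copy_to(net.parameters())  # ... and swap in the EMA weights
# ... run validation here ...
ema.restore(net.parameters())  # back to training: restore the live weights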
-     def _loss(self, forward_out, x_t, z, t, mean, x):
-         """
-         Different loss functions can be used to train the score model, see the paper:
-
-         Julius Richter, Danilo de Oliveira, and Timo Gerkmann
-         "Investigating Training Objectives for Generative Speech Enhancement"
-         https://arxiv.org/abs/2409.10753
-         """
-
-         sigma = self.sde._std(t)[:, None, None, None]
-
-         if self.loss_type == "score_matching":
-             score = forward_out
-             if self.loss_weighting == "sigma^2":
-                 losses = torch.square(torch.abs(score * sigma + z))  # Eq. (7)
-             else:
-                 raise ValueError("Invalid loss weighting for loss_type=score_matching: {}".format(self.loss_weighting))
-             # Sum over spatial dimensions and channels and mean over batch
-             loss = torch.mean(0.5*torch.sum(losses.reshape(losses.shape[0], -1), dim=-1))
-         elif self.loss_type == "denoiser":
-             score = forward_out
-             D = score * sigma.pow(2) + x_t  # equivalent to Eq. (10)
-             losses = torch.square(torch.abs(D - mean))  # Eq. (8)
-             if self.loss_weighting == "1":
-                 losses = losses
-             elif self.loss_weighting == "sigma^2":
-                 losses = losses * sigma**2
-             elif self.loss_weighting == "edm":
-                 # sigma is already broadcast to (B, 1, 1, 1) above
-                 losses = ((sigma**2 + self.sigma_data**2) / ((sigma*self.sigma_data)**2)) * losses
-             else:
-                 raise ValueError("Invalid loss weighting for loss_type=denoiser: {}".format(self.loss_weighting))
-             # Sum over spatial dimensions and channels and mean over batch
-             loss = torch.mean(0.5*torch.sum(losses.reshape(losses.shape[0], -1), dim=-1))
-         elif self.loss_type == "data_prediction":
-             x_hat = forward_out
-             B, C, F, T = x.shape
-
-             # losses in the time-frequency domain (tf)
-             losses_tf = (1/(F*T))*torch.square(torch.abs(x_hat - x))
-             losses_tf = torch.mean(0.5*torch.sum(losses_tf.reshape(losses_tf.shape[0], -1), dim=-1))
-
-             # losses in the time domain (td)
-             target_len = (self.data_module.num_frames - 1) * self.data_module.hop_length
-             x_hat_td = self.to_audio(x_hat.squeeze(), target_len)
-             x_td = self.to_audio(x.squeeze(), target_len)
-             losses_l1 = (1 / target_len) * torch.abs(x_hat_td - x_td)
-             losses_l1 = torch.mean(0.5*torch.sum(losses_l1.reshape(losses_l1.shape[0], -1), dim=-1))
-
-             # losses using PESQ
-             if self.pesq_weight > 0.0:
-                 losses_pesq = self.pesq_loss(x_td, x_hat_td)
-                 losses_pesq = torch.mean(losses_pesq)
-                 # combine the losses
-                 loss = losses_tf + self.l1_weight * losses_l1 + self.pesq_weight * losses_pesq
-             else:
-                 loss = losses_tf + self.l1_weight * losses_l1
-         else:
-             raise ValueError("Invalid loss type: {}".format(self.loss_type))
-
-         return loss
-
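A quick sanity check on the 'score_matching' branch (illustrative sketch, not part of the commit): for x_t = mean + sigma * z, the score of the Gaussian perturbation kernel is -z / sigma, so a perfect model makes the residual (score * sigma + z) vanish, which is exactly the quantity penalized in Eq. (7).

import torch

sigma = torch.tensor(0.4)
z = torch.randn(1, 1, 16, 16, dtype=torch.cfloat)
perfect_score = -z / sigma
residual = perfect_score * sigma + z
print(torch.square(torch.abs(residual)).sum())  # ~0 up to rounding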
-     def _step(self, batch, batch_idx):
-         x, y = batch
-         t = torch.rand(x.shape[0], device=x.device) * (self.sde.T - self.t_eps) + self.t_eps
-         mean, std = self.sde.marginal_prob(x, y, t)
-         z = torch.randn_like(x)  # complex i.i.d. normal; real and imaginary parts each have var=0.5
-         sigma = std[:, None, None, None]
-         x_t = mean + sigma * z
-         forward_out = self(x_t, y, t)
-         loss = self._loss(forward_out, x_t, z, t, mean, x)
-         return loss
-
-     def training_step(self, batch, batch_idx):
-         loss = self._step(batch, batch_idx)
-         self.log('train_loss', loss, on_step=True, on_epoch=True, sync_dist=True, prog_bar=True)
-         return loss
-
-     def validation_step(self, batch, batch_idx):
-         # Evaluate speech enhancement performance
-         if batch_idx == 0 and self.num_eval_files != 0:
-             rank = dist.get_rank()
-             world_size = dist.get_world_size()
-
-             # Split the evaluation files among the GPUs
-             eval_files_per_gpu = self.num_eval_files // world_size
-
-             clean_files = self.data_module.valid_set.clean_files[:self.num_eval_files]
-             noisy_files = self.data_module.valid_set.noisy_files[:self.num_eval_files]
-             print(f"Processing {len(clean_files)} evaluation files in total")
-
-             # Select the files for this GPU (the last rank also takes the remainder)
-             if rank == world_size - 1:
-                 clean_files = clean_files[rank*eval_files_per_gpu:]
-                 noisy_files = noisy_files[rank*eval_files_per_gpu:]
-             else:
-                 clean_files = clean_files[rank*eval_files_per_gpu:(rank+1)*eval_files_per_gpu]
-                 noisy_files = noisy_files[rank*eval_files_per_gpu:(rank+1)*eval_files_per_gpu]
-
-             # Evaluate the performance of the model
-             pesq_sum = 0; si_sdr_sum = 0; estoi_sum = 0
-             start_time = time.time()
-             for (clean_file, noisy_file) in zip(clean_files, noisy_files):
-                 # Load the clean and noisy speech
-                 x, sr_x = load(clean_file)
-                 x = x.squeeze().numpy()
-                 y, sr_y = load(noisy_file)
-                 assert sr_x == sr_y, "Sample rates of clean and noisy files do not match!"
-
-                 # Resample if necessary
-                 if sr_x != 16000:
-                     x_16k = resample(x, orig_sr=sr_x, target_sr=16000).squeeze()
-                 else:
-                     x_16k = x
-
-                 # Enhance the noisy speech
-                 x_hat = self.enhance(y, N=self.sde.N)
-                 if self.sr != 16000:
-                     x_hat_16k = resample(x_hat, orig_sr=self.sr, target_sr=16000).squeeze()
-                 else:
-                     x_hat_16k = x_hat
-
-                 pesq_sum += pesq(16000, x_16k, x_hat_16k, 'wb')
-                 si_sdr_sum += si_sdr(x, x_hat)
-                 estoi_sum += stoi(x, x_hat, self.sr, extended=True)
-
-             print(f"Processed {len(clean_files)} files in {time.time()-start_time:.1f} s")
-             pesq_avg = pesq_sum / len(clean_files)
-             si_sdr_avg = si_sdr_sum / len(clean_files)
-             estoi_avg = estoi_sum / len(clean_files)
-
-             self.log('pesq', pesq_avg, on_step=False, on_epoch=True, sync_dist=True)
-             self.log('si_sdr', si_sdr_avg, on_step=False, on_epoch=True, sync_dist=True)
-             self.log('estoi', estoi_avg, on_step=False, on_epoch=True, sync_dist=True)
-
-         loss = self._step(batch, batch_idx)
-         self.log('valid_loss', loss, on_step=False, on_epoch=True, sync_dist=True)
-
-         return loss
-
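The sharding above gives every rank eval_files_per_gpu files, with the last rank also absorbing the remainder; a small worked check of the index arithmetic (illustrative sketch, not part of the commit):

num_eval_files, world_size = 50, 4
eval_files_per_gpu = num_eval_files // world_size  # 12
for rank in range(world_size):
    lo = rank * eval_files_per_gpu
    hi = num_eval_files if rank == world_size - 1 else (rank + 1) * eval_files_per_gpu
    print(rank, lo, hi)  # ranks 0-2 get 12 files each, rank 3 gets 14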
267
- def forward(self, x_t, y, t):
268
- """
269
- The model forward pass. In [1] and [2], the model estimates the score function. In [3], the model estimates
270
- either the score function or the target data for the Schrödinger bridge (loss_type='data_prediction').
271
-
272
- [1] Julius Richter, Simon Welker, Jean-Marie Lemercier, Bunlong Lay, and Timo Gerkmann
273
- "Speech Enhancement and Dereverberation with Diffusion-Based Generative Models"
274
- IEEE/ACM Transactions on Audio, Speech, and Language Processing, vol. 31, pp. 2351-2364, 2023.
275
-
276
- [2] Julius Richter, Yi-Chiao Wu, Steven Krenn, Simon Welker, Bunlong Lay, Shinji Watanabe, Alexander Richard, and Timo Gerkmann
277
- "EARS: An Anechoic Fullband Speech Dataset Benchmarked for Speech Enhancement and Dereverberation"
278
- ISCA Interspeech, Kos, Greece, Sept. 2024.
279
-
280
- [3] Julius Richter, Danilo de Oliveira, and Timo Gerkmann
281
- "Investigating Training Objectives for Generative Speech Enhancement"
282
- https://arxiv.org/abs/2409.10753
283
-
284
- """
285
-
286
- # In [3], we use new code with backbone='ncsnpp_v2':
287
- if self.backbone == "ncsnpp_v2":
288
- F = self.dnn(self._c_in(t) * x_t, self._c_in(t) * y, t)
289
-
290
- # Scaling the network output, see below Eq. (7) in the paper
291
- if self.network_scaling == "1/sigma":
292
- std = self.sde._std(t)
293
- F = F / std[:, None, None, None]
294
- elif self.network_scaling == "1/t":
295
- F = F / t[:, None, None, None]
296
-
297
- # The loss type determines the output of the model
298
- if self.loss_type == "score_matching":
299
- score = self._c_skip(t) * x_t + self._c_out(t) * F
300
- return score
301
- elif self.loss_type == "denoiser":
302
- sigmas = self.sde._std(t)[:, None, None, None]
303
- score = (F - x_t) / sigmas.pow(2)
304
- return score
305
- elif self.loss_type == 'data_prediction':
306
- x_hat = self._c_skip(t) * x_t + self._c_out(t) * F
307
- return x_hat
308
-
309
- # In [1] and [2], we use the old code:
310
- else:
311
- dnn_input = torch.cat([x_t, y], dim=1)
312
- score = -self.dnn(dnn_input, t)
313
- return score
314
-
315
- def _c_in(self, t):
316
- if self.c_in == "1":
317
- return 1.0
318
- elif self.c_in == "edm":
319
- sigma = self.sde._std(t)
320
- return (1.0 / torch.sqrt(sigma**2 + self.sigma_data**2))[:, None, None, None]
321
- else:
322
- raise ValueError("Invalid c_in type: {}".format(self.c_in))
323
-
324
- def _c_out(self, t):
325
- if self.c_out == "1":
326
- return 1.0
327
- elif self.c_out == "sigma":
328
- return self.sde._std(t)[:, None, None, None]
329
- elif self.c_out == "1/sigma":
330
- return 1.0 / self.sde._std(t)[:, None, None, None]
331
- elif self.c_out == "edm":
332
- sigma = self.sde._std(t)
333
- return ((sigma * self.sigma_data) / torch.sqrt(self.sigma_data**2 + sigma**2))[:, None, None, None]
334
- else:
335
- raise ValueError("Invalid c_out type: {}".format(self.c_out))
336
-
337
- def _c_skip(self, t):
338
- if self.c_skip == "0":
339
- return 0.0
340
- elif self.c_skip == "edm":
341
- sigma = self.sde._std(t)
342
- return (self.sigma_data**2 / (sigma**2 + self.sigma_data**2))[:, None, None, None]
343
- else:
344
- raise ValueError("Invalid c_skip type: {}".format(self.c_skip))
345
-
346
- def to(self, *args, **kwargs):
347
- """Override PyTorch .to() to also transfer the EMA of the model weights"""
348
- self.ema.to(*args, **kwargs)
349
- return super().to(*args, **kwargs)
350
-
351
- def get_pc_sampler(self, predictor_name, corrector_name, y, N=None, minibatch=None, **kwargs):
352
- N = self.sde.N if N is None else N
353
- sde = self.sde.copy()
354
- sde.N = N
355
-
356
- kwargs = {"eps": self.t_eps, **kwargs}
357
- if minibatch is None:
358
- return sampling.get_pc_sampler(predictor_name, corrector_name, sde=sde, score_fn=self, y=y, **kwargs)
359
- else:
360
- M = y.shape[0]
361
- def batched_sampling_fn():
362
- samples, ns = [], []
363
- for i in range(int(ceil(M / minibatch))):
364
- y_mini = y[i*minibatch:(i+1)*minibatch]
365
- sampler = sampling.get_pc_sampler(predictor_name, corrector_name, sde=sde, score_fn=self, y=y_mini, **kwargs)
366
- sample, n = sampler()
367
- samples.append(sample)
368
- ns.append(n)
369
- samples = torch.cat(samples, dim=0)
370
- return samples, ns
371
- return batched_sampling_fn
372
-
373
- def get_ode_sampler(self, y, N=None, minibatch=None, **kwargs):
374
- N = self.sde.N if N is None else N
375
- sde = self.sde.copy()
376
- sde.N = N
377
-
378
- kwargs = {"eps": self.t_eps, **kwargs}
379
- if minibatch is None:
380
- return sampling.get_ode_sampler(sde, self, y=y, **kwargs)
381
- else:
382
- M = y.shape[0]
383
- def batched_sampling_fn():
384
- samples, ns = [], []
385
- for i in range(int(ceil(M / minibatch))):
386
- y_mini = y[i*minibatch:(i+1)*minibatch]
387
- sampler = sampling.get_ode_sampler(sde, self, y=y_mini, **kwargs)
388
- sample, n = sampler()
389
- samples.append(sample)
390
- ns.append(n)
391
- samples = torch.cat(samples, dim=0)
392
- return samples, ns
393
- return batched_sampling_fn
394
-
395
- def get_sb_sampler(self, sde, y, sampler_type="ode", N=None, **kwargs):
396
- N = sde.N if N is None else N
397
- sde = self.sde.copy()
398
- sde.N = N
399
-
400
- return sampling.get_sb_sampler(sde, self, y=y, sampler_type=sampler_type, **kwargs)
401
-
402
- def train_dataloader(self):
403
- return self.data_module.train_dataloader()
404
-
405
- def val_dataloader(self):
406
- return self.data_module.val_dataloader()
407
-
408
- def test_dataloader(self):
409
- return self.data_module.test_dataloader()
410
-
411
- def setup(self, stage=None):
412
- return self.data_module.setup(stage=stage)
413
-
414
- def to_audio(self, spec, length=None):
415
- return self._istft(self._backward_transform(spec), length)
416
-
417
- def _forward_transform(self, spec):
418
- return self.data_module.spec_fwd(spec)
419
-
420
- def _backward_transform(self, spec):
421
- return self.data_module.spec_back(spec)
422
-
423
- def _stft(self, sig):
424
- return self.data_module.stft(sig)
425
-
426
- def _istft(self, spec, length=None):
427
- return self.data_module.istft(spec, length)
428
-
429
- def enhance(self, y, sampler_type="pc", predictor="reverse_diffusion",
430
- corrector="ald", N=30, corrector_steps=1, snr=0.5, timeit=False,
431
- **kwargs
432
- ):
433
- """
434
- One-call speech enhancement of noisy speech `y`, for convenience.
435
- """
436
- start = time.time()
437
- T_orig = y.size(1)
438
- norm_factor = y.abs().max().item()
439
- y = y / norm_factor
440
- Y = torch.unsqueeze(self._forward_transform(self._stft(y.cuda())), 0)
441
- Y = pad_spec(Y)
442
-
443
- # SGMSE sampling with OUVE SDE
444
- if self.sde.__class__.__name__ == 'OUVESDE':
445
- if self.sde.sampler_type == "pc":
446
- sampler = self.get_pc_sampler(predictor, corrector, Y.cuda(), N=N,
447
- corrector_steps=corrector_steps, snr=snr, intermediate=False,
448
- **kwargs)
449
- elif self.sde.sampler_type == "ode":
450
- sampler = self.get_ode_sampler(Y.cuda(), N=N, **kwargs)
451
- else:
452
- raise ValueError("Invalid sampler type for SGMSE sampling: {}".format(sampler_type))
453
- # Schrödinger bridge sampling with VE SDE
454
- elif self.sde.__class__.__name__ == 'SBVESDE':
455
- sampler = self.get_sb_sampler(sde=self.sde, y=Y.cuda(), sampler_type=self.sde.sampler_type)
456
- else:
457
- raise ValueError("Invalid SDE type for speech enhancement: {}".format(self.sde.__class__.__name__))
458
-
459
- sample, nfe = sampler()
460
- x_hat = self.to_audio(sample.squeeze(), T_orig)
461
- x_hat = x_hat * norm_factor
462
- x_hat = x_hat.squeeze().cpu().numpy()
463
- end = time.time()
464
- if timeit:
465
- rtf = (end-start)/(len(x_hat)/self.sr)
466
- return x_hat, nfe, rtf
467
- else:
468
- return x_hat
 
 
 
 
1
+ import time
2
+ from math import ceil
3
+ import warnings
4
+
5
+ import torch
6
+ import pytorch_lightning as pl
7
+ import torch.distributed as dist
8
+ from torchaudio import load
9
+ from torch_ema import ExponentialMovingAverage
10
+ from librosa import resample
11
+
12
+ from sgmse import sampling
13
+ from sgmse.sdes import SDERegistry
14
+ from sgmse.backbones import BackboneRegistry
15
+ from sgmse.util.inference import evaluate_model
16
+ from sgmse.util.other import pad_spec, si_sdr
17
+ from pesq import pesq
18
+ from pystoi import stoi
19
+ from torch_pesq import PesqLoss
21
+
22
+ class ScoreModel(pl.LightningModule):
23
+ @staticmethod
24
+ def add_argparse_args(parser):
25
+ parser.add_argument("--lr", type=float, default=1e-4, help="The learning rate (1e-4 by default)")
26
+ parser.add_argument("--ema_decay", type=float, default=0.999, help="The parameter EMA decay constant (0.999 by default)")
27
+ parser.add_argument("--t_eps", type=float, default=0.03, help="The minimum process time (0.03 by default)")
28
+ parser.add_argument("--num_eval_files", type=int, default=50, help="Number of files for speech enhancement performance evaluation during training. Pass 0 to turn off (no checkpoints based on evaluation metrics will be generated).")
29
+ parser.add_argument("--loss_type", type=str, default="score_matching", help="The type of loss function to use.")
30
+ parser.add_argument("--loss_weighting", type=str, default="sigma^2", help="The weighting of the loss function.")
31
+ parser.add_argument("--network_scaling", type=str, default=None, help="The type of loss scaling to use.")
32
+ parser.add_argument("--c_in", type=str, default="1", help="The input scaling for x.")
33
+ parser.add_argument("--c_out", type=str, default="1", help="The output scaling.")
34
+ parser.add_argument("--c_skip", type=str, default="0", help="The skip connection scaling.")
35
+ parser.add_argument("--sigma_data", type=float, default=0.1, help="The data standard deviation.")
36
+ parser.add_argument("--l1_weight", type=float, default=0.001, help="The balance between the time-frequency and time-domain losses.")
37
+ parser.add_argument("--pesq_weight", type=float, default=0.0, help="The balance between the time-frequency and time-domain losses.")
38
+ parser.add_argument("--sr", type=int, default=16000, help="The sample rate of the audio files.")
39
+ parser.add_argument("--k", type=float, default=2.6, help="Parameter of the diffusion coefficient. 2.6 by default.")
40
+ parser.add_argument("--c", type=float, default=0.4, help="Parameter of the diffusion coefficient. 0.4 by default.")
41
+
42
+ return parser
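+ # Usage sketch (hypothetical; typically called from a training script):
+ #   parser = ScoreModel.add_argparse_args(argparse.ArgumentParser())
+ #   args = parser.parse_args(["--lr", "5e-5", "--loss_type", "denoiser"])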
43
+
44
+ def __init__(
45
+ self, backbone, sde, lr=1e-4, ema_decay=0.999, t_eps=0.03, num_eval_files=20, loss_type='score_matching',
46
+ loss_weighting='sigma^2', network_scaling=None, c_in='1', c_out='1', c_skip='0', sigma_data=0.1,
47
+ l1_weight=0.001, pesq_weight=0.0, sr=16000, data_module_cls=None, **kwargs
48
+ ):
49
+ """
50
+ Create a new ScoreModel.
51
+
52
+ Args:
53
+ backbone: Backbone DNN that serves as a score-based model.
54
+ sde: The SDE that defines the diffusion process.
55
+ lr: The learning rate of the optimizer. (1e-4 by default).
56
+ ema_decay: The decay constant of the parameter EMA (0.999 by default).
57
+ t_eps: The minimum process time, used to avoid numerical issues very close to zero (0.03 by default).
58
+ loss_type: The type of training objective. Options are 'score_matching' (default), 'denoiser', and 'data_prediction'.
59
+ """
60
+ super().__init__()
61
+ # Initialize Backbone DNN
62
+ self.backbone = backbone
63
+ dnn_cls = BackboneRegistry.get_by_name(backbone)
64
+ self.dnn = dnn_cls(**kwargs)
65
+ # Initialize SDE
66
+ sde_cls = SDERegistry.get_by_name(sde)
67
+ self.sde = sde_cls(**kwargs)
68
+ # Store hyperparams and save them
69
+ self.lr = lr
70
+ self.ema_decay = ema_decay
71
+ self.ema = ExponentialMovingAverage(self.parameters(), decay=self.ema_decay)
72
+ self._error_loading_ema = False
73
+ self.t_eps = t_eps
74
+ self.loss_type = loss_type
75
+ self.loss_weighting = loss_weighting
76
+ self.l1_weight = l1_weight
77
+ self.pesq_weight = pesq_weight
78
+ self.network_scaling = network_scaling
79
+ self.c_in = c_in
80
+ self.c_out = c_out
81
+ self.c_skip = c_skip
82
+ self.sigma_data = sigma_data
83
+ self.num_eval_files = num_eval_files
84
+ self.sr = sr
85
+ # Initialize PESQ loss if pesq_weight > 0.0
86
+ if pesq_weight > 0.0:
87
+ self.pesq_loss = PesqLoss(1.0, sample_rate=sr).eval()
88
+ for param in self.pesq_loss.parameters():
89
+ param.requires_grad = False
90
+ self.save_hyperparameters(ignore=['no_wandb'])
91
+ self.data_module = data_module_cls(**kwargs, gpu=kwargs.get('gpus', 0) > 0)
92
+
93
+ def configure_optimizers(self):
94
+ optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
95
+ return optimizer
96
+
97
+ def optimizer_step(self, *args, **kwargs):
98
+ # Method overridden so that the EMA params are updated after each optimizer step
99
+ super().optimizer_step(*args, **kwargs)
100
+ self.ema.update(self.dnn.parameters())
101
+
102
+ # on_load_checkpoint / on_save_checkpoint needed for EMA storing/loading
103
+ def on_load_checkpoint(self, checkpoint):
104
+ ema = checkpoint.get('ema', None)
105
+ if ema is not None:
106
+ self.ema.load_state_dict(ema)
107
+ else:
108
+ self._error_loading_ema = True
109
+ warnings.warn("EMA state_dict not found in checkpoint!")
110
+
111
+ def on_save_checkpoint(self, checkpoint):
112
+ checkpoint['ema'] = self.ema.state_dict()
113
+
114
+ def train(self, mode, no_ema=False):
115
+ res = super().train(mode) # call the standard `train` method with the given mode
116
+ if not self._error_loading_ema:
117
+ if not mode and not no_ema:
118
+ # eval
119
+ self.ema.store(self.dnn.parameters()) # store current params in EMA
120
+ self.ema.copy_to(self.dnn.parameters()) # copy EMA parameters over current params for evaluation
121
+ else:
122
+ # train
123
+ if self.ema.collected_params is not None:
124
+ self.ema.restore(self.dnn.parameters()) # restore the EMA weights (if stored)
125
+ return res
126
+
127
+ def eval(self, no_ema=False):
128
+ return self.train(False, no_ema=no_ema)
129
+
130
+ def _loss(self, forward_out, x_t, z, t, mean, x):
131
+ """
132
+ Different loss functions can be used to train the score model, see the paper:
133
+
134
+ Julius Richter, Danilo de Oliveira, and Timo Gerkmann
135
+ "Investigating Training Objectives for Generative Speech Enhancement"
136
+ https://arxiv.org/abs/2409.10753
137
+
138
+ """
139
+
140
+ sigma = self.sde._std(t)[:, None, None, None]
141
+
142
+ if self.loss_type == "score_matching":
143
+ score = forward_out
144
+ if self.loss_weighting == "sigma^2":
145
+ losses = torch.square(torch.abs(score * sigma + z)) # Eq. (7)
146
+ else:
147
+ raise ValueError("Invalid loss weighting for loss_type=score_matching: {}".format(self.loss_weighting))
148
+ # Sum over spatial dimensions and channels and mean over batch
149
+ loss = torch.mean(0.5*torch.sum(losses.reshape(losses.shape[0], -1), dim=-1))
150
+ elif self.loss_type == "denoiser":
151
+ score = forward_out
152
+ D = score * sigma.pow(2) + x_t # equivalent to Eq. (10)
153
+ losses = torch.square(torch.abs(D - mean)) # Eq. (8)
154
+ if self.loss_weighting == "1":
155
+ losses = losses
156
+ elif self.loss_weighting == "sigma^2":
157
+ losses = losses * sigma**2
158
+ elif self.loss_weighting == "edm":
159
+ losses = ((sigma**2 + self.sigma_data**2) / ((sigma * self.sigma_data)**2)) * losses # sigma is already shaped [B, 1, 1, 1]
160
+ else:
161
+ raise ValueError("Invalid loss weighting for loss_type=denoiser: {}".format(self.loss_weighting))
162
+ # Sum over spatial dimensions and channels and mean over batch
163
+ loss = torch.mean(0.5*torch.sum(losses.reshape(losses.shape[0], -1), dim=-1))
164
+ elif self.loss_type == "data_prediction":
165
+ x_hat = forward_out
166
+ B, C, F, T = x.shape
167
+
168
+ # losses in the time-frequency domain (tf)
169
+ losses_tf = (1/(F*T))*torch.square(torch.abs(x_hat - x))
170
+ losses_tf = torch.mean(0.5*torch.sum(losses_tf.reshape(losses_tf.shape[0], -1), dim=-1))
171
+
172
+ # losses in the time domain (td)
173
+ target_len = (self.data_module.num_frames - 1) * self.data_module.hop_length
174
+ x_hat_td = self.to_audio(x_hat.squeeze(), target_len)
175
+ x_td = self.to_audio(x.squeeze(), target_len)
176
+ losses_l1 = (1 / target_len) * torch.abs(x_hat_td - x_td)
177
+ losses_l1 = torch.mean(0.5*torch.sum(losses_l1.reshape(losses_l1.shape[0], -1), dim=-1))
178
+
179
+ # losses using PESQ
180
+ if self.pesq_weight > 0.0:
181
+ losses_pesq = self.pesq_loss(x_td, x_hat_td)
182
+ losses_pesq = torch.mean(losses_pesq)
183
+ # combine the losses
184
+ loss = losses_tf + self.l1_weight * losses_l1 + self.pesq_weight * losses_pesq
185
+ else:
186
+ loss = losses_tf + self.l1_weight * losses_l1
187
+ else:
188
+ raise ValueError("Invalid loss type: {}".format(self.loss_type))
189
+
190
+ return loss
191
+
192
+ def _step(self, batch, batch_idx):
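+ # Draw t ~ U(t_eps, T), perturb the clean target via the SDE marginals, and
+ # evaluate the configured training objective on the network output.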
193
+ x, y = batch
194
+ t = torch.rand(x.shape[0], device=x.device) * (self.sde.T - self.t_eps) + self.t_eps
195
+ mean, std = self.sde.marginal_prob(x, y, t)
196
+ z = torch.randn_like(x) # i.i.d. complex normal; real and imaginary parts each have variance 0.5
197
+ sigma = std[:, None, None, None]
198
+ x_t = mean + sigma * z
199
+ forward_out = self(x_t, y, t)
200
+ loss = self._loss(forward_out, x_t, z, t, mean, x)
201
+ return loss
202
+
203
+ def training_step(self, batch, batch_idx):
204
+ loss = self._step(batch, batch_idx)
205
+ self.log('train_loss', loss, on_step=True, on_epoch=True, sync_dist=True, prog_bar=True)
206
+ return loss
207
+
208
+ def validation_step(self, batch, batch_idx):
209
+ # Evaluate speech enhancement performance
210
+ if batch_idx == 0 and self.num_eval_files != 0:
211
+ rank = dist.get_rank()
212
+ world_size = dist.get_world_size()
213
+
214
+ # Split the evaluation files among the GPUs
215
+ eval_files_per_gpu = self.num_eval_files // world_size
216
+
217
+ clean_files = self.data_module.valid_set.clean_files[:self.num_eval_files]
218
+ noisy_files = self.data_module.valid_set.noisy_files[:self.num_eval_files]
219
+
220
+ print(f"Process: {len(clean_files)} files")
221
+ # Select the files for this GPU
222
+ if rank == world_size - 1:
223
+ clean_files = clean_files[rank*eval_files_per_gpu:]
224
+ noisy_files = noisy_files[rank*eval_files_per_gpu:]
225
+ else:
226
+ clean_files = clean_files[rank*eval_files_per_gpu:(rank+1)*eval_files_per_gpu]
227
+ noisy_files = noisy_files[rank*eval_files_per_gpu:(rank+1)*eval_files_per_gpu]
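+ # Example (hypothetical numbers): num_eval_files=50 on world_size=4 gives 12
+ # files each to ranks 0-2 and the remaining 14 to the last rank.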
228
+
229
+ # Evaluate the performance of the model
230
+ pesq_sum, si_sdr_sum, estoi_sum = 0.0, 0.0, 0.0
231
+ start_time = time.time()
232
+ for (clean_file, noisy_file) in zip(clean_files, noisy_files):
233
+ # Load the clean and noisy speech
234
+ x, sr_x = load(clean_file)
235
+ x = x.squeeze().numpy()
236
+ y, sr_y = load(noisy_file)
237
+ assert sr_x == sr_y, "Sample rates of clean and noisy files do not match!"
238
+
239
+ # Resample if necessary
240
+ if sr_x != 16000:
241
+ x_16k = resample(x, orig_sr=sr_x, target_sr=16000).squeeze()
242
+ else:
243
+ x_16k = x
244
+
245
+ # Enhance the noisy speech
246
+ x_hat = self.enhance(y, N=self.sde.N)
247
+ if self.sr != 16000:
248
+ x_hat_16k = resample(x_hat, orig_sr=self.sr, target_sr=16000).squeeze()
249
+ else:
250
+ x_hat_16k = x_hat
251
+
252
+ pesq_sum += pesq(16000, x_16k, x_hat_16k, 'wb')
253
+ si_sdr_sum += si_sdr(x, x_hat)
254
+ estoi_sum += stoi(x, x_hat, self.sr, extended=True)
255
+
256
+ print(f"process {eval_files_per_gpu} in {time.time()-start_time}")
257
+ pesq_avg = pesq_sum / len(clean_files)
258
+ si_sdr_avg = si_sdr_sum / len(clean_files)
259
+ estoi_avg = estoi_sum / len(clean_files)
260
+
261
+ self.log('pesq', pesq_avg, on_step=False, on_epoch=True, sync_dist=True)
262
+ self.log('si_sdr', si_sdr_avg, on_step=False, on_epoch=True, sync_dist=True)
263
+ self.log('estoi', estoi_avg, on_step=False, on_epoch=True, sync_dist=True)
264
+
265
+ loss = self._step(batch, batch_idx)
266
+ self.log('valid_loss', loss, on_step=False, on_epoch=True, sync_dist=True)
267
+
268
+ return loss
269
+
270
+ def forward(self, x_t, y, t):
271
+ """
272
+ The model forward pass. In [1] and [2], the model estimates the score function. In [3], the model estimates
273
+ either the score function or the target data for the Schrödinger bridge (loss_type='data_prediction').
274
+
275
+ [1] Julius Richter, Simon Welker, Jean-Marie Lemercier, Bunlong Lay, and Timo Gerkmann
276
+ "Speech Enhancement and Dereverberation with Diffusion-Based Generative Models"
277
+ IEEE/ACM Transactions on Audio, Speech, and Language Processing, vol. 31, pp. 2351-2364, 2023.
278
+
279
+ [2] Julius Richter, Yi-Chiao Wu, Steven Krenn, Simon Welker, Bunlong Lay, Shinji Watanabe, Alexander Richard, and Timo Gerkmann
280
+ "EARS: An Anechoic Fullband Speech Dataset Benchmarked for Speech Enhancement and Dereverberation"
281
+ ISCA Interspeech, Kos, Greece, Sept. 2024.
282
+
283
+ [3] Julius Richter, Danilo de Oliveira, and Timo Gerkmann
284
+ "Investigating Training Objectives for Generative Speech Enhancement"
285
+ https://arxiv.org/abs/2409.10753
286
+
287
+ """
288
+
289
+ # In [3], we use new code with backbone='ncsnpp_v2':
290
+ if self.backbone == "ncsnpp_v2":
291
+ F = self.dnn(self._c_in(t) * x_t, self._c_in(t) * y, t)
292
+
293
+ # Scaling the network output, see below Eq. (7) in the paper
294
+ if self.network_scaling == "1/sigma":
295
+ std = self.sde._std(t)
296
+ F = F / std[:, None, None, None]
297
+ elif self.network_scaling == "1/t":
298
+ F = F / t[:, None, None, None]
299
+
300
+ # The loss type determines the output of the model
301
+ if self.loss_type == "score_matching":
302
+ score = self._c_skip(t) * x_t + self._c_out(t) * F
303
+ return score
304
+ elif self.loss_type == "denoiser":
305
+ sigmas = self.sde._std(t)[:, None, None, None]
306
+ score = (F - x_t) / sigmas.pow(2)
307
+ return score
308
+ elif self.loss_type == 'data_prediction':
309
+ x_hat = self._c_skip(t) * x_t + self._c_out(t) * F
310
+ return x_hat
311
+
312
+ # In [1] and [2], we use the old code:
313
+ else:
314
+ dnn_input = torch.cat([x_t, y], dim=1)
315
+ score = -self.dnn(dnn_input, t)
316
+ return score
317
+
318
+ def _c_in(self, t):
319
+ if self.c_in == "1":
320
+ return 1.0
321
+ elif self.c_in == "edm":
322
+ sigma = self.sde._std(t)
323
+ return (1.0 / torch.sqrt(sigma**2 + self.sigma_data**2))[:, None, None, None]
324
+ else:
325
+ raise ValueError("Invalid c_in type: {}".format(self.c_in))
326
+
327
+ def _c_out(self, t):
328
+ if self.c_out == "1":
329
+ return 1.0
330
+ elif self.c_out == "sigma":
331
+ return self.sde._std(t)[:, None, None, None]
332
+ elif self.c_out == "1/sigma":
333
+ return 1.0 / self.sde._std(t)[:, None, None, None]
334
+ elif self.c_out == "edm":
335
+ sigma = self.sde._std(t)
336
+ return ((sigma * self.sigma_data) / torch.sqrt(self.sigma_data**2 + sigma**2))[:, None, None, None]
337
+ else:
338
+ raise ValueError("Invalid c_out type: {}".format(self.c_out))
339
+
340
+ def _c_skip(self, t):
341
+ if self.c_skip == "0":
342
+ return 0.0
343
+ elif self.c_skip == "edm":
344
+ sigma = self.sde._std(t)
345
+ return (self.sigma_data**2 / (sigma**2 + self.sigma_data**2))[:, None, None, None]
346
+ else:
347
+ raise ValueError("Invalid c_skip type: {}".format(self.c_skip))
348
+
349
+ def to(self, *args, **kwargs):
350
+ """Override PyTorch .to() to also transfer the EMA of the model weights"""
351
+ self.ema.to(*args, **kwargs)
352
+ return super().to(*args, **kwargs)
353
+
354
+ def get_pc_sampler(self, predictor_name, corrector_name, y, N=None, minibatch=None, **kwargs):
355
+ N = self.sde.N if N is None else N
356
+ sde = self.sde.copy()
357
+ sde.N = N
358
+
359
+ kwargs = {"eps": self.t_eps, **kwargs}
360
+ if minibatch is None:
361
+ return sampling.get_pc_sampler(predictor_name, corrector_name, sde=sde, score_fn=self, y=y, **kwargs)
362
+ else:
363
+ M = y.shape[0]
364
+ def batched_sampling_fn():
365
+ samples, ns = [], []
366
+ for i in range(int(ceil(M / minibatch))):
367
+ y_mini = y[i*minibatch:(i+1)*minibatch]
368
+ sampler = sampling.get_pc_sampler(predictor_name, corrector_name, sde=sde, score_fn=self, y=y_mini, **kwargs)
369
+ sample, n = sampler()
370
+ samples.append(sample)
371
+ ns.append(n)
372
+ samples = torch.cat(samples, dim=0)
373
+ return samples, ns
374
+ return batched_sampling_fn
375
+
376
+ def get_ode_sampler(self, y, N=None, minibatch=None, **kwargs):
377
+ N = self.sde.N if N is None else N
378
+ sde = self.sde.copy()
379
+ sde.N = N
380
+
381
+ kwargs = {"eps": self.t_eps, **kwargs}
382
+ if minibatch is None:
383
+ return sampling.get_ode_sampler(sde, self, y=y, **kwargs)
384
+ else:
385
+ M = y.shape[0]
386
+ def batched_sampling_fn():
387
+ samples, ns = [], []
388
+ for i in range(int(ceil(M / minibatch))):
389
+ y_mini = y[i*minibatch:(i+1)*minibatch]
390
+ sampler = sampling.get_ode_sampler(sde, self, y=y_mini, **kwargs)
391
+ sample, n = sampler()
392
+ samples.append(sample)
393
+ ns.append(n)
394
+ samples = torch.cat(samples, dim=0)
395
+ return samples, ns
396
+ return batched_sampling_fn
397
+
398
+ def get_sb_sampler(self, sde, y, sampler_type="ode", N=None, **kwargs):
399
+ N = sde.N if N is None else N
400
+ sde = self.sde.copy()
401
+ sde.N = N
402
+
403
+ return sampling.get_sb_sampler(sde, self, y=y, sampler_type=sampler_type, **kwargs)
404
+
405
+ def train_dataloader(self):
406
+ return self.data_module.train_dataloader()
407
+
408
+ def val_dataloader(self):
409
+ return self.data_module.val_dataloader()
410
+
411
+ def test_dataloader(self):
412
+ return self.data_module.test_dataloader()
413
+
414
+ def setup(self, stage=None):
415
+ return self.data_module.setup(stage=stage)
416
+
417
+ def to_audio(self, spec, length=None):
418
+ return self._istft(self._backward_transform(spec), length)
419
+
420
+ def _forward_transform(self, spec):
421
+ return self.data_module.spec_fwd(spec)
422
+
423
+ def _backward_transform(self, spec):
424
+ return self.data_module.spec_back(spec)
425
+
426
+ def _stft(self, sig):
427
+ return self.data_module.stft(sig)
428
+
429
+ def _istft(self, spec, length=None):
430
+ return self.data_module.istft(spec, length)
431
+
432
+ def enhance(self, y, sampler_type="pc", predictor="reverse_diffusion",
433
+ corrector="ald", N=30, corrector_steps=1, snr=0.5, timeit=False,
434
+ **kwargs
435
+ ):
436
+ """
437
+ One-call speech enhancement of noisy speech `y`, for convenience.
438
+ """
439
+ start = time.time()
440
+ T_orig = y.size(1)
441
+ norm_factor = y.abs().max().item()
442
+ y = y / norm_factor
443
+ Y = torch.unsqueeze(self._forward_transform(self._stft(y.cuda())), 0)
444
+ Y = pad_spec(Y)
445
+
446
+ # SGMSE sampling with OUVE SDE
447
+ if self.sde.__class__.__name__ == 'OUVESDE':
448
+ if self.sde.sampler_type == "pc":
449
+ sampler = self.get_pc_sampler(predictor, corrector, Y.cuda(), N=N,
450
+ corrector_steps=corrector_steps, snr=snr, intermediate=False,
451
+ **kwargs)
452
+ elif self.sde.sampler_type == "ode":
453
+ sampler = self.get_ode_sampler(Y.cuda(), N=N, **kwargs)
454
+ else:
455
+ raise ValueError("Invalid sampler type for SGMSE sampling: {}".format(sampler_type))
456
+ # Schrödinger bridge sampling with VE SDE
457
+ elif self.sde.__class__.__name__ == 'SBVESDE':
458
+ sampler = self.get_sb_sampler(sde=self.sde, y=Y.cuda(), sampler_type=self.sde.sampler_type)
459
+ else:
460
+ raise ValueError("Invalid SDE type for speech enhancement: {}".format(self.sde.__class__.__name__))
461
+
462
+ sample, nfe = sampler()
463
+ x_hat = self.to_audio(sample.squeeze(), T_orig)
464
+ x_hat = x_hat * norm_factor
465
+ x_hat = x_hat.squeeze().cpu().numpy()
466
+ end = time.time()
467
+ if timeit:
468
+ rtf = (end-start)/(len(x_hat)/self.sr)
469
+ return x_hat, nfe, rtf
470
+ else:
471
+ return x_hat
sgmse/sampling/__init__.py CHANGED
@@ -1,249 +1,249 @@
1
- # Adapted from https://github.com/yang-song/score_sde_pytorch/blob/1618ddea340f3e4a2ed7852a0694a809775cf8d0/sampling.py
2
- """Various sampling methods."""
3
- from scipy import integrate
4
- import torch
5
-
6
- from .predictors import Predictor, PredictorRegistry, ReverseDiffusionPredictor
7
- from .correctors import Corrector, CorrectorRegistry
8
-
9
-
10
- __all__ = [
11
- 'PredictorRegistry', 'CorrectorRegistry', 'Predictor', 'Corrector',
12
- 'get_pc_sampler', 'get_ode_sampler', 'get_sb_sampler'
13
- ]
14
-
15
-
16
- def to_flattened_numpy(x):
17
- """Flatten a torch tensor `x` and convert it to numpy."""
18
- return x.detach().cpu().numpy().reshape((-1,))
19
-
20
-
21
- def from_flattened_numpy(x, shape):
22
- """Form a torch tensor with the given `shape` from a flattened numpy array `x`."""
23
- return torch.from_numpy(x.reshape(shape))
24
-
25
-
26
- def get_pc_sampler(
27
- predictor_name, corrector_name, sde, score_fn, y,
28
- denoise=True, eps=3e-2, snr=0.1, corrector_steps=1, probability_flow: bool = False,
29
- intermediate=False, **kwargs
30
- ):
31
- """Create a Predictor-Corrector (PC) sampler.
32
-
33
- Args:
34
- predictor_name: The name of a registered `sampling.Predictor`.
35
- corrector_name: The name of a registered `sampling.Corrector`.
36
- sde: An `sdes.SDE` object representing the forward SDE.
37
- score_fn: A function (typically learned model) that predicts the score.
38
- y: A `torch.Tensor`, representing the (non-white-)noisy starting point(s) to condition the prior on.
39
- denoise: If `True`, add one-step denoising to the final samples.
40
- eps: A `float` number. The reverse-time SDE and ODE are integrated to `epsilon` to avoid numerical issues.
41
- snr: The SNR to use for the corrector. 0.1 by default, and ignored for `NoneCorrector`.
42
- N: The number of reverse sampling steps is taken from the `sde.N` property of the passed SDE.
43
-
44
- Returns:
45
- A sampling function that returns samples and the number of function evaluations during sampling.
46
- """
47
- predictor_cls = PredictorRegistry.get_by_name(predictor_name)
48
- corrector_cls = CorrectorRegistry.get_by_name(corrector_name)
49
- predictor = predictor_cls(sde, score_fn, probability_flow=probability_flow)
50
- corrector = corrector_cls(sde, score_fn, snr=snr, n_steps=corrector_steps)
51
-
52
- def pc_sampler():
53
- """The PC sampler function."""
54
- with torch.no_grad():
55
- xt = sde.prior_sampling(y.shape, y).to(y.device)
56
- timesteps = torch.linspace(sde.T, eps, sde.N, device=y.device)
57
- for i in range(sde.N):
58
- t = timesteps[i]
59
- if i != len(timesteps) - 1:
60
- stepsize = t - timesteps[i+1]
61
- else:
62
- stepsize = timesteps[-1] # from eps to 0
63
- vec_t = torch.ones(y.shape[0], device=y.device) * t
64
- xt, xt_mean = corrector.update_fn(xt, y, vec_t)
65
- xt, xt_mean = predictor.update_fn(xt, y, vec_t, stepsize)
66
- x_result = xt_mean if denoise else xt
67
- ns = sde.N * (corrector.n_steps + 1)
68
- return x_result, ns
69
-
70
- return pc_sampler
71
-
72
-
73
- def get_ode_sampler(
74
- sde, score_fn, y, inverse_scaler=None,
75
- denoise=True, rtol=1e-5, atol=1e-5,
76
- method='RK45', eps=3e-2, device='cuda', **kwargs
77
- ):
78
- """Probability flow ODE sampler with the black-box ODE solver.
79
-
80
- Args:
81
- sde: An `sdes.SDE` object representing the forward SDE.
82
- score_fn: A function (typically learned model) that predicts the score.
83
- y: A `torch.Tensor`, representing the (non-white-)noisy starting point(s) to condition the prior on.
84
- inverse_scaler: The inverse data normalizer.
85
- denoise: If `True`, add one-step denoising to final samples.
86
- rtol: A `float` number. The relative tolerance level of the ODE solver.
87
- atol: A `float` number. The absolute tolerance level of the ODE solver.
88
- method: A `str`. The algorithm used for the black-box ODE solver.
89
- See the documentation of `scipy.integrate.solve_ivp`.
90
- eps: A `float` number. The reverse-time SDE/ODE will be integrated to `eps` for numerical stability.
91
- device: PyTorch device.
92
-
93
- Returns:
94
- A sampling function that returns samples and the number of function evaluations during sampling.
95
- """
96
- predictor = ReverseDiffusionPredictor(sde, score_fn, probability_flow=False)
97
- rsde = sde.reverse(score_fn, probability_flow=True)
98
-
99
- def denoise_update_fn(x):
100
- vec_eps = torch.ones(x.shape[0], device=x.device) * eps
101
- _, x = predictor.update_fn(x, y, vec_eps)
102
- return x
103
-
104
- def drift_fn(x, y, t):
105
- """Get the drift function of the reverse-time SDE."""
106
- return rsde.sde(x, y, t)[0]
107
-
108
- def ode_sampler(z=None, **kwargs):
109
- """The probability flow ODE sampler with black-box ODE solver.
110
-
111
- Args:
112
- z: If present, generate samples from latent code `z` (currently unused:
113
- the initial state is always drawn from the SDE prior).
114
- Returns:
115
- samples, number of function evaluations.
116
- """
117
- with torch.no_grad():
118
- # Sample the initial state from the prior distribution of the SDE.
119
- x = sde.prior_sampling(y.shape, y).to(device)
120
-
121
- def ode_func(t, x):
122
- x = from_flattened_numpy(x, y.shape).to(device).type(torch.complex64)
123
- vec_t = torch.ones(y.shape[0], device=x.device) * t
124
- drift = drift_fn(x, y, vec_t)
125
- return to_flattened_numpy(drift)
126
-
127
- # Black-box ODE solver for the probability flow ODE
128
- solution = integrate.solve_ivp(
129
- ode_func, (sde.T, eps), to_flattened_numpy(x),
130
- rtol=rtol, atol=atol, method=method, **kwargs
131
- )
132
- nfe = solution.nfev
133
- x = torch.tensor(solution.y[:, -1]).reshape(y.shape).to(device).type(torch.complex64)
134
-
135
- # Denoising is equivalent to running one predictor step without adding noise
136
- if denoise:
137
- x = denoise_update_fn(x)
138
-
139
- if inverse_scaler is not None:
140
- x = inverse_scaler(x)
141
- return x, nfe
142
-
143
- return ode_sampler
144
-
145
- def get_sb_sampler(sde, model, y, eps=1e-4, n_steps=50, sampler_type="ode", **kwargs):
146
- # adapted from https://github.com/NVIDIA/NeMo/blob/78357ae99ff2cf9f179f53fbcb02c88a5a67defb/nemo/collections/audio/parts/submodules/schroedinger_bridge.py#L382
147
- def sde_sampler():
148
- """The SB-SDE sampler function."""
149
- with torch.no_grad():
150
- xt = y[:, [0], :, :] # special case for storm_2ch
151
- time_steps = torch.linspace(sde.T, eps, sde.N + 1, device=y.device)
152
-
153
- # Initial values
154
- time_prev = time_steps[0] * torch.ones(xt.shape[0], device=xt.device)
155
- sigma_prev, sigma_T, sigma_bar_prev, alpha_prev, alpha_T, alpha_bar_prev = sde._sigmas_alphas(time_prev)
156
-
157
- for t in time_steps[1:]:
158
- # Prepare time steps for the whole batch
159
- time = t * torch.ones(xt.shape[0], device=xt.device)
160
-
161
- # Get noise schedule for current time
162
- sigma_t, sigma_T, sigma_bart, alpha_t, alpha_T, alpha_bart = sde._sigmas_alphas(time)
163
-
164
- # Run DNN
165
- current_estimate = model(xt, y, time)
166
-
167
- # Calculate scaling for the first-order discretization from the paper
168
- weight_prev = alpha_t * sigma_t**2 / (alpha_prev * sigma_prev**2 + sde.eps)
169
- tmp = 1 - sigma_t**2 / (sigma_prev**2 + sde.eps)
170
- weight_estimate = alpha_t * tmp
171
- weight_z = alpha_t * sigma_t * torch.sqrt(tmp)
172
-
173
- # View as [B, C, D, T]
174
- weight_prev = weight_prev[:, None, None, None]
175
- weight_estimate = weight_estimate[:, None, None, None]
176
- weight_z = weight_z[:, None, None, None]
177
-
178
- # Random sample
179
- z_norm = torch.randn_like(xt)
180
-
181
- if t == time_steps[-1]:
182
- weight_z = 0.0
183
-
184
- # Update state: weighted sum of previous state, current estimate and noise
185
- xt = weight_prev * xt + weight_estimate * current_estimate + weight_z * z_norm
186
-
187
- # Save previous values
188
- time_prev = time
189
- alpha_prev = alpha_t
190
- sigma_prev = sigma_t
191
- sigma_bar_prev = sigma_bart
192
-
193
- return xt, sde.N # one model evaluation per step
194
-
195
- def ode_sampler():
196
- """The SB-ODE sampler function."""
197
- with torch.no_grad():
198
- xt = y
199
- time_steps = torch.linspace(sde.T, eps, sde.N + 1, device=y.device)
200
-
201
- # Initial values
202
- time_prev = time_steps[0] * torch.ones(xt.shape[0], device=xt.device)
203
- sigma_prev, sigma_T, sigma_bar_prev, alpha_prev, alpha_T, alpha_bar_prev = sde._sigmas_alphas(time_prev)
204
-
205
- for t in time_steps[1:]:
206
- # Prepare time steps for the whole batch
207
- time = t * torch.ones(xt.shape[0], device=xt.device)
208
-
209
- # Get noise schedule for current time
210
- sigma_t, sigma_T, sigma_bart, alpha_t, alpha_T, alpha_bart = sde._sigmas_alphas(time)
211
-
212
- # Run DNN
213
- current_estimate = model(xt, y, time)
214
-
215
- # Calculate scaling for the first-order discretization from the paper
216
- weight_prev = alpha_t * sigma_t * sigma_bart / (alpha_prev * sigma_prev * sigma_bar_prev + sde.eps)
217
- weight_estimate = (
218
- alpha_t
219
- / (sigma_T**2 + sde.eps)
220
- * (sigma_bart**2 - sigma_bar_prev * sigma_t * sigma_bart / (sigma_prev + sde.eps))
221
- )
222
- weight_prior_mean = (
223
- alpha_t
224
- / (alpha_T * sigma_T**2 + sde.eps)
225
- * (sigma_t**2 - sigma_prev * sigma_t * sigma_bart / (sigma_bar_prev + sde.eps))
226
- )
227
-
228
- # View as [B, C, D, T]
229
- weight_prev = weight_prev[:, None, None, None]
230
- weight_estimate = weight_estimate[:, None, None, None]
231
- weight_prior_mean = weight_prior_mean[:, None, None, None]
232
-
233
- # Update state: weighted sum of previous state, current estimate and prior
234
- xt = weight_prev * xt + weight_estimate * current_estimate + weight_prior_mean * y
235
-
236
- # Save previous values
237
- time_prev = time
238
- alpha_prev = alpha_t
239
- sigma_prev = sigma_t
240
- sigma_bar_prev = sigma_bart
241
-
242
- return xt, sde.N # one model evaluation per step
243
-
244
- if sampler_type == "sde":
245
- return sde_sampler
246
- elif sampler_type == "ode":
247
- return ode_sampler
248
- else:
249
- raise ValueError("Invalid type. Choose 'ode' or 'sde'.")
 
1
+ # Adapted from https://github.com/yang-song/score_sde_pytorch/blob/1618ddea340f3e4a2ed7852a0694a809775cf8d0/sampling.py
2
+ """Various sampling methods."""
3
+ from scipy import integrate
4
+ import torch
5
+
6
+ from .predictors import Predictor, PredictorRegistry, ReverseDiffusionPredictor
7
+ from .correctors import Corrector, CorrectorRegistry
8
+
9
+
10
+ __all__ = [
11
+ 'PredictorRegistry', 'CorrectorRegistry', 'Predictor', 'Corrector',
12
+ 'get_pc_sampler', 'get_ode_sampler', 'get_sb_sampler'
13
+ ]
14
+
15
+
16
+ def to_flattened_numpy(x):
17
+ """Flatten a torch tensor `x` and convert it to numpy."""
18
+ return x.detach().cpu().numpy().reshape((-1,))
19
+
20
+
21
+ def from_flattened_numpy(x, shape):
22
+ """Form a torch tensor with the given `shape` from a flattened numpy array `x`."""
23
+ return torch.from_numpy(x.reshape(shape))
24
+
25
+
26
+ def get_pc_sampler(
27
+ predictor_name, corrector_name, sde, score_fn, y,
28
+ denoise=True, eps=3e-2, snr=0.1, corrector_steps=1, probability_flow: bool = False,
29
+ intermediate=False, **kwargs
30
+ ):
31
+ """Create a Predictor-Corrector (PC) sampler.
32
+
33
+ Args:
34
+ predictor_name: The name of a registered `sampling.Predictor`.
35
+ corrector_name: The name of a registered `sampling.Corrector`.
36
+ sde: An `sdes.SDE` object representing the forward SDE.
37
+ score_fn: A function (typically learned model) that predicts the score.
38
+ y: A `torch.Tensor`, representing the (non-white-)noisy starting point(s) to condition the prior on.
39
+ denoise: If `True`, add one-step denoising to the final samples.
40
+ eps: A `float` number. The reverse-time SDE and ODE are integrated to `epsilon` to avoid numerical issues.
41
+ snr: The SNR to use for the corrector. 0.1 by default, and ignored for `NoneCorrector`.
42
+ N: The number of reverse sampling steps is taken from the `sde.N` property of the passed SDE.
43
+
44
+ Returns:
45
+ A sampling function that returns samples and the number of function evaluations during sampling.
46
+ """
47
+ predictor_cls = PredictorRegistry.get_by_name(predictor_name)
48
+ corrector_cls = CorrectorRegistry.get_by_name(corrector_name)
49
+ predictor = predictor_cls(sde, score_fn, probability_flow=probability_flow)
50
+ corrector = corrector_cls(sde, score_fn, snr=snr, n_steps=corrector_steps)
51
+
52
+ def pc_sampler():
53
+ """The PC sampler function."""
54
+ with torch.no_grad():
55
+ xt = sde.prior_sampling(y.shape, y).to(y.device)
56
+ timesteps = torch.linspace(sde.T, eps, sde.N, device=y.device)
57
+ for i in range(sde.N):
58
+ t = timesteps[i]
59
+ if i != len(timesteps) - 1:
60
+ stepsize = t - timesteps[i+1]
61
+ else:
62
+ stepsize = timesteps[-1] # from eps to 0
63
+ vec_t = torch.ones(y.shape[0], device=y.device) * t
64
+ xt, xt_mean = corrector.update_fn(xt, y, vec_t)
65
+ xt, xt_mean = predictor.update_fn(xt, y, vec_t, stepsize)
66
+ x_result = xt_mean if denoise else xt
67
+ ns = sde.N * (corrector.n_steps + 1)
68
+ return x_result, ns
69
+
70
+ return pc_sampler
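+ # Usage sketch (argument values mirror those in ScoreModel.enhance):
+ #   sampler = get_pc_sampler('reverse_diffusion', 'ald', sde=sde, score_fn=model,
+ #                            y=Y, corrector_steps=1, snr=0.5)
+ #   x, nfe = sampler()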
71
+
72
+
73
+ def get_ode_sampler(
74
+ sde, score_fn, y, inverse_scaler=None,
75
+ denoise=True, rtol=1e-5, atol=1e-5,
76
+ method='RK45', eps=3e-2, device='cuda', **kwargs
77
+ ):
78
+ """Probability flow ODE sampler with the black-box ODE solver.
79
+
80
+ Args:
81
+ sde: An `sdes.SDE` object representing the forward SDE.
82
+ score_fn: A function (typically learned model) that predicts the score.
83
+ y: A `torch.Tensor`, representing the (non-white-)noisy starting point(s) to condition the prior on.
84
+ inverse_scaler: The inverse data normalizer.
85
+ denoise: If `True`, add one-step denoising to final samples.
86
+ rtol: A `float` number. The relative tolerance level of the ODE solver.
87
+ atol: A `float` number. The absolute tolerance level of the ODE solver.
88
+ method: A `str`. The algorithm used for the black-box ODE solver.
89
+ See the documentation of `scipy.integrate.solve_ivp`.
90
+ eps: A `float` number. The reverse-time SDE/ODE will be integrated to `eps` for numerical stability.
91
+ device: PyTorch device.
92
+
93
+ Returns:
94
+ A sampling function that returns samples and the number of function evaluations during sampling.
95
+ """
96
+ predictor = ReverseDiffusionPredictor(sde, score_fn, probability_flow=False)
97
+ rsde = sde.reverse(score_fn, probability_flow=True)
98
+
99
+ def denoise_update_fn(x):
100
+ vec_eps = torch.ones(x.shape[0], device=x.device) * eps
101
+ _, x = predictor.update_fn(x, y, vec_eps)
102
+ return x
103
+
104
+ def drift_fn(x, y, t):
105
+ """Get the drift function of the reverse-time SDE."""
106
+ return rsde.sde(x, y, t)[0]
107
+
108
+ def ode_sampler(z=None, **kwargs):
109
+ """The probability flow ODE sampler with black-box ODE solver.
110
+
111
+ Args:
112
+ z: If present, generate samples from latent code `z` (currently unused:
113
+ the initial state is always drawn from the SDE prior).
114
+ Returns:
115
+ samples, number of function evaluations.
116
+ """
117
+ with torch.no_grad():
118
+ # Sample the initial state from the prior distribution of the SDE.
119
+ x = sde.prior_sampling(y.shape, y).to(device)
120
+
121
+ def ode_func(t, x):
122
+ x = from_flattened_numpy(x, y.shape).to(device).type(torch.complex64)
123
+ vec_t = torch.ones(y.shape[0], device=x.device) * t
124
+ drift = drift_fn(x, y, vec_t)
125
+ return to_flattened_numpy(drift)
126
+
127
+ # Black-box ODE solver for the probability flow ODE
128
+ solution = integrate.solve_ivp(
129
+ ode_func, (sde.T, eps), to_flattened_numpy(x),
130
+ rtol=rtol, atol=atol, method=method, **kwargs
131
+ )
132
+ nfe = solution.nfev
133
+ x = torch.tensor(solution.y[:, -1]).reshape(y.shape).to(device).type(torch.complex64)
134
+
135
+ # Denoising is equivalent to running one predictor step without adding noise
136
+ if denoise:
137
+ x = denoise_update_fn(x)
138
+
139
+ if inverse_scaler is not None:
140
+ x = inverse_scaler(x)
141
+ return x, nfe
142
+
143
+ return ode_sampler
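+ # Note: the returned nfe counts only the ODE solver's function evaluations
+ # (solution.nfev); the optional final denoising step adds one more model call.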
144
+
145
+ def get_sb_sampler(sde, model, y, eps=1e-4, n_steps=50, sampler_type="ode", **kwargs):
146
+ # adapted from https://github.com/NVIDIA/NeMo/blob/78357ae99ff2cf9f179f53fbcb02c88a5a67defb/nemo/collections/audio/parts/submodules/schroedinger_bridge.py#L382
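+ # Both samplers below integrate the bridge from t=T down to t=eps on sde.N
+ # uniform steps; each first-order update combines the previous state, the
+ # current network estimate, and either fresh Gaussian noise (SDE variant) or
+ # the prior mean y (ODE variant).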
147
+ def sde_sampler():
148
+ """The SB-SDE sampler function."""
149
+ with torch.no_grad():
150
+ xt = y[:, [0], :, :] # special case for storm_2ch
151
+ time_steps = torch.linspace(sde.T, eps, sde.N + 1, device=y.device)
152
+
153
+ # Initial values
154
+ time_prev = time_steps[0] * torch.ones(xt.shape[0], device=xt.device)
155
+ sigma_prev, sigma_T, sigma_bar_prev, alpha_prev, alpha_T, alpha_bar_prev = sde._sigmas_alphas(time_prev)
156
+
157
+ for t in time_steps[1:]:
158
+ # Prepare time steps for the whole batch
159
+ time = t * torch.ones(xt.shape[0], device=xt.device)
160
+
161
+ # Get noise schedule for current time
162
+ sigma_t, sigma_T, sigma_bart, alpha_t, alpha_T, alpha_bart = sde._sigmas_alphas(time)
163
+
164
+ # Run DNN
165
+ current_estimate = model(xt, y, time)
166
+
167
+ # Calculate scaling for the first-order discretization from the paper
168
+ weight_prev = alpha_t * sigma_t**2 / (alpha_prev * sigma_prev**2 + sde.eps)
169
+ tmp = 1 - sigma_t**2 / (sigma_prev**2 + sde.eps)
170
+ weight_estimate = alpha_t * tmp
171
+ weight_z = alpha_t * sigma_t * torch.sqrt(tmp)
172
+
173
+ # View as [B, C, D, T]
174
+ weight_prev = weight_prev[:, None, None, None]
175
+ weight_estimate = weight_estimate[:, None, None, None]
176
+ weight_z = weight_z[:, None, None, None]
177
+
178
+ # Random sample
179
+ z_norm = torch.randn_like(xt)
180
+
181
+ if t == time_steps[-1]:
182
+ weight_z = 0.0
183
+
184
+ # Update state: weighted sum of previous state, current estimate and noise
185
+ xt = weight_prev * xt + weight_estimate * current_estimate + weight_z * z_norm
186
+
187
+ # Save previous values
188
+ time_prev = time
189
+ alpha_prev = alpha_t
190
+ sigma_prev = sigma_t
191
+ sigma_bar_prev = sigma_bart
192
+
193
+ return xt, sde.N # one model evaluation per step
194
+
195
+ def ode_sampler():
196
+ """The SB-ODE sampler function."""
197
+ with torch.no_grad():
198
+ xt = y
199
+ time_steps = torch.linspace(sde.T, eps, sde.N + 1, device=y.device)
200
+
201
+ # Initial values
202
+ time_prev = time_steps[0] * torch.ones(xt.shape[0], device=xt.device)
203
+ sigma_prev, sigma_T, sigma_bar_prev, alpha_prev, alpha_T, alpha_bar_prev = sde._sigmas_alphas(time_prev)
204
+
205
+ for t in time_steps[1:]:
206
+ # Prepare time steps for the whole batch
207
+ time = t * torch.ones(xt.shape[0], device=xt.device)
208
+
209
+ # Get noise schedule for current time
210
+ sigma_t, sigma_T, sigma_bart, alpha_t, alpha_T, alpha_bart = sde._sigmas_alphas(time)
211
+
212
+ # Run DNN
213
+ current_estimate = model(xt, y, time)
214
+
215
+ # Calculate scaling for the first-order discretization from the paper
216
+ weight_prev = alpha_t * sigma_t * sigma_bart / (alpha_prev * sigma_prev * sigma_bar_prev + sde.eps)
217
+ weight_estimate = (
218
+ alpha_t
219
+ / (sigma_T**2 + sde.eps)
220
+ * (sigma_bart**2 - sigma_bar_prev * sigma_t * sigma_bart / (sigma_prev + sde.eps))
221
+ )
222
+ weight_prior_mean = (
223
+ alpha_t
224
+ / (alpha_T * sigma_T**2 + sde.eps)
225
+ * (sigma_t**2 - sigma_prev * sigma_t * sigma_bart / (sigma_bar_prev + sde.eps))
226
+ )
227
+
228
+ # View as [B, C, D, T]
229
+ weight_prev = weight_prev[:, None, None, None]
230
+ weight_estimate = weight_estimate[:, None, None, None]
231
+ weight_prior_mean = weight_prior_mean[:, None, None, None]
232
+
233
+ # Update state: weighted sum of previous state, current estimate and prior
234
+ xt = weight_prev * xt + weight_estimate * current_estimate + weight_prior_mean * y
235
+
236
+ # Save previous values
237
+ time_prev = time
238
+ alpha_prev = alpha_t
239
+ sigma_prev = sigma_t
240
+ sigma_bar_prev = sigma_bart
241
+
242
+ return xt, sde.N # one model evaluation per step
243
+
244
+ if sampler_type == "sde":
245
+ return sde_sampler
246
+ elif sampler_type == "ode":
247
+ return ode_sampler
248
+ else:
249
+ raise ValueError("Invalid type. Choose 'ode' or 'sde'.")
sgmse/sampling/correctors.py CHANGED
@@ -1,94 +1,96 @@
1
- import abc
2
- import torch
3
-
4
- from sgmse import sdes
5
- from sgmse.util.registry import Registry
6
-
7
-
8
- CorrectorRegistry = Registry("Corrector")
9
-
10
-
11
- class Corrector(abc.ABC):
12
- """The abstract class for a corrector algorithm."""
13
-
14
- def __init__(self, sde, score_fn, snr, n_steps):
15
- super().__init__()
16
- self.rsde = sde.reverse(score_fn)
17
- self.score_fn = score_fn
18
- self.snr = snr
19
- self.n_steps = n_steps
20
-
21
- @abc.abstractmethod
22
- def update_fn(self, x, y, t, *args):
23
- """One update of the corrector.
24
-
25
- Args:
26
- x: A PyTorch tensor representing the current state
27
- t: A PyTorch tensor representing the current time step.
28
- *args: Possibly additional arguments, in particular `y` for OU processes
29
-
30
- Returns:
31
- x: A PyTorch tensor of the next state.
32
- x_mean: A PyTorch tensor. The next state without random noise. Useful for denoising.
33
- """
34
- pass
35
-
36
-
37
- @CorrectorRegistry.register(name='langevin')
38
- class LangevinCorrector(Corrector):
39
- def __init__(self, sde, score_fn, snr, n_steps):
40
- super().__init__(sde, score_fn, snr, n_steps)
41
- self.score_fn = score_fn
42
- self.n_steps = n_steps
43
- self.snr = snr
44
-
45
- def update_fn(self, x, y, t, *args):
46
- target_snr = self.snr
47
- for _ in range(self.n_steps):
48
- grad = self.score_fn(x, y, t, *args)
49
- noise = torch.randn_like(x)
50
- grad_norm = torch.norm(grad.reshape(grad.shape[0], -1), dim=-1).mean()
51
- noise_norm = torch.norm(noise.reshape(noise.shape[0], -1), dim=-1).mean()
52
- step_size = ((target_snr * noise_norm / grad_norm) ** 2 * 2).unsqueeze(0)
53
- x_mean = x + step_size[:, None, None, None] * grad
54
- x = x_mean + noise * torch.sqrt(step_size * 2)[:, None, None, None]
55
-
56
- return x, x_mean
57
-
58
-
59
- @CorrectorRegistry.register(name='ald')
60
- class AnnealedLangevinDynamics(Corrector):
61
- """The original annealed Langevin dynamics predictor in NCSN/NCSNv2."""
62
- def __init__(self, sde, score_fn, snr, n_steps):
63
- super().__init__(sde, score_fn, snr, n_steps)
64
- self.sde = sde
65
- self.score_fn = score_fn
66
- self.snr = snr
67
- self.n_steps = n_steps
68
-
69
- def update_fn(self, x, y, t, *args):
70
- n_steps = self.n_steps
71
- target_snr = self.snr
72
- std = self.sde.marginal_prob(x, y, t, *args)[1]
73
-
74
- for _ in range(n_steps):
75
- grad = self.score_fn(x, y, t, *args)
76
- noise = torch.randn_like(x)
77
- step_size = (target_snr * std) ** 2 * 2
78
- x_mean = x + step_size[:, None, None, None] * grad
79
- x = x_mean + noise * torch.sqrt(step_size * 2)[:, None, None, None]
80
-
81
- return x, x_mean
82
-
83
-
84
- @CorrectorRegistry.register(name='none')
85
- class NoneCorrector(Corrector):
86
- """An empty corrector that does nothing."""
87
-
88
- def __init__(self, *args, **kwargs):
89
- self.snr = 0
90
- self.n_steps = 0
92
-
93
- def update_fn(self, x, t, *args):
94
- return x, x
 
 
 
1
+ import abc
2
+ import torch
3
+
4
+ from sgmse import sdes
5
+ from sgmse.util.registry import Registry
6
+
7
+
8
+ CorrectorRegistry = Registry("Corrector")
9
+
10
+
11
+ class Corrector(abc.ABC):
12
+ """The abstract class for a corrector algorithm."""
13
+
14
+ def __init__(self, sde, score_fn, snr, n_steps):
15
+ super().__init__()
16
+ self.rsde = sde.reverse(score_fn)
17
+ self.score_fn = score_fn
18
+ self.snr = snr
19
+ self.n_steps = n_steps
20
+
21
+ @abc.abstractmethod
22
+ def update_fn(self, x, t, *args):
23
+ """One update of the corrector.
24
+
25
+ Args:
26
+ x: A PyTorch tensor representing the current state
27
+ t: A PyTorch tensor representing the current time step.
28
+ *args: Possibly additional arguments, in particular `y` for OU processes
29
+
30
+ Returns:
31
+ x: A PyTorch tensor of the next state.
32
+ x_mean: A PyTorch tensor. The next state without random noise. Useful for denoising.
33
+ """
34
+ pass
35
+
36
+
37
+ @CorrectorRegistry.register(name='langevin')
38
+ class LangevinCorrector(Corrector):
39
+ def __init__(self, sde, score_fn, snr, n_steps):
40
+ super().__init__(sde, score_fn, snr, n_steps)
41
+ self.score_fn = score_fn
42
+ self.n_steps = n_steps
43
+ self.snr = snr
44
+
45
+ def update_fn(self, x, t, *args):
46
+ target_snr = self.snr
47
+ for _ in range(self.n_steps):
48
+ grad = self.score_fn(x, t, *args)
49
+ noise = torch.randn_like(x)
50
+ grad_norm = torch.norm(grad.reshape(grad.shape[0], -1), dim=-1).mean()
51
+ noise_norm = torch.norm(noise.reshape(noise.shape[0], -1), dim=-1).mean()
52
+ step_size = ((target_snr * noise_norm / grad_norm) ** 2 * 2).unsqueeze(0)
53
+ x_mean = x + step_size[:, None, None, None] * grad
54
+ x = x_mean + noise * torch.sqrt(step_size * 2)[:, None, None, None]
55
+
56
+ return x, x_mean
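+ # The step size above follows the SNR heuristic from the score_sde codebase:
+ # with r = snr, step_size = 2 * (r * ||noise|| / ||grad||)^2, which keeps the
+ # injected noise at the requested signal-to-noise ratio relative to the
+ # score-driven drift.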
57
+
58
+
59
+ @CorrectorRegistry.register(name='ald')
60
+ class AnnealedLangevinDynamics(Corrector):
61
+ """The original annealed Langevin dynamics predictor in NCSN/NCSNv2."""
62
+ def __init__(self, sde, score_fn, snr, n_steps):
63
+ super().__init__(sde, score_fn, snr, n_steps)
64
+ if not isinstance(sde, (sdes.OUVESDE,)):
65
+ raise NotImplementedError(f"SDE class {sde.__class__.__name__} not yet supported.")
66
+ self.sde = sde
67
+ self.score_fn = score_fn
68
+ self.snr = snr
69
+ self.n_steps = n_steps
70
+
71
+ def update_fn(self, x, t, *args):
72
+ n_steps = self.n_steps
73
+ target_snr = self.snr
74
+ std = self.sde.marginal_prob(x, t, *args)[1]
75
+
76
+ for _ in range(n_steps):
77
+ grad = self.score_fn(x, t, *args)
78
+ noise = torch.randn_like(x)
79
+ step_size = (target_snr * std) ** 2 * 2
80
+ x_mean = x + step_size[:, None, None, None] * grad
81
+ x = x_mean + noise * torch.sqrt(step_size * 2)[:, None, None, None]
82
+
83
+ return x, x_mean
84
+
85
+
86
+ @CorrectorRegistry.register(name='none')
87
+ class NoneCorrector(Corrector):
88
+ """An empty corrector that does nothing."""
89
+
90
+ def __init__(self, *args, **kwargs):
91
+ self.snr = 0
92
+ self.n_steps = 0
94
+
95
+ def update_fn(self, x, t, *args):
96
+ return x, x
sgmse/sampling/predictors.py CHANGED
@@ -1,76 +1,76 @@
1
- import abc
2
-
3
- import torch
4
- import numpy as np
5
-
6
- from sgmse.util.registry import Registry
7
-
8
-
9
- PredictorRegistry = Registry("Predictor")
10
-
11
-
12
- class Predictor(abc.ABC):
13
- """The abstract class for a predictor algorithm."""
14
-
15
- def __init__(self, sde, score_fn, probability_flow=False):
16
- super().__init__()
17
- self.sde = sde
18
- self.rsde = sde.reverse(score_fn)
19
- self.score_fn = score_fn
20
- self.probability_flow = probability_flow
21
-
22
- @abc.abstractmethod
23
- def update_fn(self, x, t, *args):
24
- """One update of the predictor.
25
-
26
- Args:
27
- x: A PyTorch tensor representing the current state
28
- t: A PyTorch tensor representing the current time step.
29
- *args: Possibly additional arguments, in particular `y` for OU processes
30
-
31
- Returns:
32
- x: A PyTorch tensor of the next state.
33
- x_mean: A PyTorch tensor. The next state without random noise. Useful for denoising.
34
- """
35
- pass
36
-
37
- def debug_update_fn(self, x, t, *args):
38
- raise NotImplementedError(f"Debug update function not implemented for predictor {self}.")
39
-
40
-
41
- @PredictorRegistry.register('euler_maruyama')
42
- class EulerMaruyamaPredictor(Predictor):
43
- def __init__(self, sde, score_fn, probability_flow=False):
44
- super().__init__(sde, score_fn, probability_flow=probability_flow)
45
-
46
- def update_fn(self, x, y, t, *args):
47
- dt = -1. / self.rsde.N
48
- z = torch.randn_like(x)
49
- f, g = self.rsde.sde(x, y, t, *args)
50
- x_mean = x + f * dt
51
- x = x_mean + g[:, None, None, None] * np.sqrt(-dt) * z
52
- return x, x_mean
53
-
54
-
55
- @PredictorRegistry.register('reverse_diffusion')
56
- class ReverseDiffusionPredictor(Predictor):
57
- def __init__(self, sde, score_fn, probability_flow=False):
58
- super().__init__(sde, score_fn, probability_flow=probability_flow)
59
-
60
- def update_fn(self, x, y, t, stepsize):
61
- f, g = self.rsde.discretize(x, y, t, stepsize)
62
- z = torch.randn_like(x)
63
- x_mean = x - f
64
- x = x_mean + g[:, None, None, None] * z
65
- return x, x_mean
66
-
67
-
68
- @PredictorRegistry.register('none')
69
- class NonePredictor(Predictor):
70
- """An empty predictor that does nothing."""
71
-
72
- def __init__(self, *args, **kwargs):
73
- pass
74
-
75
- def update_fn(self, x, y, t, *args):
76
- return x, x
 
1
+ import abc
2
+
3
+ import torch
4
+ import numpy as np
5
+
6
+ from sgmse.util.registry import Registry
7
+
8
+
9
+ PredictorRegistry = Registry("Predictor")
10
+
11
+
12
+ class Predictor(abc.ABC):
13
+ """The abstract class for a predictor algorithm."""
14
+
15
+ def __init__(self, sde, score_fn, probability_flow=False):
16
+ super().__init__()
17
+ self.sde = sde
18
+ self.rsde = sde.reverse(score_fn)
19
+ self.score_fn = score_fn
20
+ self.probability_flow = probability_flow
21
+
22
+ @abc.abstractmethod
23
+ def update_fn(self, x, t, *args):
24
+ """One update of the predictor.
25
+
26
+ Args:
27
+ x: A PyTorch tensor representing the current state
28
+ t: A PyTorch tensor representing the current time step.
29
+ *args: Possibly additional arguments, in particular `y` for OU processes
30
+
31
+ Returns:
32
+ x: A PyTorch tensor of the next state.
33
+ x_mean: A PyTorch tensor. The next state without random noise. Useful for denoising.
34
+ """
35
+ pass
36
+
37
+ def debug_update_fn(self, x, t, *args):
38
+ raise NotImplementedError(f"Debug update function not implemented for predictor {self}.")
39
+
40
+
41
+ @PredictorRegistry.register('euler_maruyama')
42
+ class EulerMaruyamaPredictor(Predictor):
43
+ def __init__(self, sde, score_fn, probability_flow=False):
44
+ super().__init__(sde, score_fn, probability_flow=probability_flow)
45
+
46
+ def update_fn(self, x, t, *args):
47
+ dt = -1. / self.rsde.N
48
+ z = torch.randn_like(x)
49
+ f, g = self.rsde.sde(x, t, *args)
50
+ x_mean = x + f * dt
51
+ x = x_mean + g[:, None, None, None] * np.sqrt(-dt) * z
52
+ return x, x_mean
53
+
54
+
55
+ @PredictorRegistry.register('reverse_diffusion')
56
+ class ReverseDiffusionPredictor(Predictor):
57
+ def __init__(self, sde, score_fn, probability_flow=False):
58
+ super().__init__(sde, score_fn, probability_flow=probability_flow)
59
+
60
+ def update_fn(self, x, t, y, stepsize):
61
+ f, g = self.rsde.discretize(x, t, y, stepsize)
62
+ z = torch.randn_like(x)
63
+ x_mean = x - f
64
+ x = x_mean + g[:, None, None, None] * z
65
+ return x, x_mean
66
+
67
+
68
+ @PredictorRegistry.register('none')
69
+ class NonePredictor(Predictor):
70
+ """An empty predictor that does nothing."""
71
+
72
+ def __init__(self, *args, **kwargs):
73
+ pass
74
+
75
+ def update_fn(self, x, t, *args):
76
+ return x, x
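
For orientation, here is a minimal sketch of how these predictors are driven, assuming the `(x, t, y)` argument order established above and a zero placeholder in place of a trained score model; the tensor shapes are invented for illustration:

import torch
from sgmse.sdes import OUVESDE
from sgmse.sampling.predictors import PredictorRegistry

sde = OUVESDE(theta=1.5, sigma_min=0.05, sigma_max=0.5, N=30)
y = torch.randn(2, 1, 64, 64)                    # stands in for the noisy conditioning signal
x = sde.prior_sampling(y.shape, y)               # start the reverse process at p_T(x|y)

score_fn = lambda x, t, y: torch.zeros_like(x)   # placeholder score, not a trained model
predictor = PredictorRegistry.get_by_name('euler_maruyama')(sde, score_fn)

t = torch.ones(y.shape[0]) * sde.T               # all batch elements start at t = T
x, x_mean = predictor.update_fn(x, t, y)         # one reverse Euler-Maruyama step
print(x.shape, x_mean.shape)                     # torch.Size([2, 1, 64, 64]) twice

Looping this update from t = T down to 0 over N steps, with the trained score model in place of the placeholder, is essentially what the PC samplers do.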
sgmse/sdes.py CHANGED
@@ -1,313 +1,392 @@
"""
Abstract SDE classes, Reverse SDE, and VE/VP SDEs.

Taken and adapted from https://github.com/yang-song/score_sde_pytorch/blob/1618ddea340f3e4a2ed7852a0694a809775cf8d0/sde_lib.py
"""
import abc
import warnings

import numpy as np
from sgmse.util.tensors import batch_broadcast
import torch

from sgmse.util.registry import Registry


SDERegistry = Registry("SDE")


class SDE(abc.ABC):
    """SDE abstract class. Functions are designed for a mini-batch of inputs."""

    def __init__(self, N):
        """Construct an SDE.

        Args:
            N: number of discretization time steps.
        """
        super().__init__()
        self.N = N

    @property
    @abc.abstractmethod
    def T(self):
        """End time of the SDE."""
        pass

    @abc.abstractmethod
-    def sde(self, x, y, t, *args):
+    def sde(self, x, t, *args):
        pass

    @abc.abstractmethod
-    def marginal_prob(self, x, y, t, *args):
+    def marginal_prob(self, x, t, *args):
        """Parameters to determine the marginal distribution of the SDE, $p_t(x|args)$."""
        pass

    @abc.abstractmethod
    def prior_sampling(self, shape, *args):
        """Generate one sample from the prior distribution, $p_T(x|args)$ with shape `shape`."""
        pass

    @abc.abstractmethod
    def prior_logp(self, z):
        """Compute log-density of the prior distribution.

        Useful for computing the log-likelihood via probability flow ODE.

        Args:
            z: latent code
        Returns:
            log probability density
        """
        pass

    @staticmethod
    @abc.abstractmethod
    def add_argparse_args(parent_parser):
        """
        Add the necessary arguments for instantiation of this SDE class to an argparse ArgumentParser.
        """
        pass

-    def discretize(self, x, y, t, stepsize):
+    def discretize(self, x, t, y, stepsize):
        """Discretize the SDE in the form: x_{i+1} = x_i + f_i(x_i) + G_i z_i.

        Useful for reverse diffusion sampling and probability flow sampling.
        Defaults to Euler-Maruyama discretization.

        Args:
            x: a torch tensor
            t: a torch float representing the time step (from 0 to `self.T`)
            y: a torch tensor representing the conditioning signal
            stepsize: the discretization step size

        Returns:
            f, G
        """
        dt = stepsize
-        drift, diffusion = self.sde(x, y, t)
+        drift, diffusion = self.sde(x, t, y)
        f = drift * dt
        G = diffusion * torch.sqrt(dt)
        return f, G

    def reverse(oself, score_model, probability_flow=False):
        """Create the reverse-time SDE/ODE.

        Args:
            score_model: A function that takes x, t and y and returns the score.
            probability_flow: If `True`, create the reverse-time ODE used for probability flow sampling.
        """
        N = oself.N
        T = oself.T
        sde_fn = oself.sde
        discretize_fn = oself.discretize

        # Build the class for reverse-time SDE.
        class RSDE(oself.__class__):
            def __init__(self):
                self.N = N
                self.probability_flow = probability_flow

            @property
            def T(self):
                return T

-            def sde(self, x, y, t, *args):
+            def sde(self, x, t, *args):
                """Create the drift and diffusion functions for the reverse SDE/ODE."""
-                rsde_parts = self.rsde_parts(x, y, t, *args)
+                rsde_parts = self.rsde_parts(x, t, *args)
                total_drift, diffusion = rsde_parts["total_drift"], rsde_parts["diffusion"]
                return total_drift, diffusion

-            def rsde_parts(self, x, y, t, *args):
-                sde_drift, sde_diffusion = sde_fn(x, y, t, *args)
-                score = score_model(x, y, t, *args)
+            def rsde_parts(self, x, t, *args):
+                sde_drift, sde_diffusion = sde_fn(x, t, *args)
+                score = score_model(x, t, *args)
                score_drift = -sde_diffusion[:, None, None, None]**2 * score * (0.5 if self.probability_flow else 1.)
                diffusion = torch.zeros_like(sde_diffusion) if self.probability_flow else sde_diffusion
                total_drift = sde_drift + score_drift
                return {
                    'total_drift': total_drift, 'diffusion': diffusion, 'sde_drift': sde_drift,
                    'sde_diffusion': sde_diffusion, 'score_drift': score_drift, 'score': score,
                }

-            def discretize(self, x, y, t, stepsize):
+            def discretize(self, x, t, y, stepsize):
                """Create discretized iteration rules for the reverse diffusion sampler."""
-                f, G = discretize_fn(x, y, t, stepsize)
-                rev_f = f - G[:, None, None, None] ** 2 * score_model(x, y, t) * (0.5 if self.probability_flow else 1.)
+                f, G = discretize_fn(x, t, y, stepsize)
+                rev_f = f - G[:, None, None, None] ** 2 * score_model(x, t, y) * (0.5 if self.probability_flow else 1.)
                rev_G = torch.zeros_like(G) if self.probability_flow else G
                return rev_f, rev_G

        return RSDE()

    @abc.abstractmethod
    def copy(self):
        pass


@SDERegistry.register("ouve")
class OUVESDE(SDE):
    @staticmethod
    def add_argparse_args(parser):
        parser.add_argument("--theta", type=float, default=1.5, help="The constant stiffness of the Ornstein-Uhlenbeck process. 1.5 by default.")
        parser.add_argument("--sigma-min", type=float, default=0.05, help="The minimum sigma to use. 0.05 by default.")
        parser.add_argument("--sigma-max", type=float, default=0.5, help="The maximum sigma to use. 0.5 by default.")
        parser.add_argument("--N", type=int, default=30, help="The number of timesteps in the SDE discretization. 30 by default.")
        parser.add_argument("--sampler_type", type=str, default="pc", help="Type of sampler to use. 'pc' by default.")
        return parser

    def __init__(self, theta, sigma_min, sigma_max, N=30, sampler_type="pc", **ignored_kwargs):
        """Construct an Ornstein-Uhlenbeck Variance Exploding SDE.

        Note that the "steady-state mean" `y` is not provided at construction, but must rather be given as an argument
        to the methods which require it (e.g., `sde` or `marginal_prob`).

        dx = theta (y-x) dt + sigma(t) dw

        with

        sigma(t) = sigma_min (sigma_max/sigma_min)^t * sqrt(2 log(sigma_max/sigma_min))

        Args:
            theta: stiffness parameter.
            sigma_min: smallest sigma.
            sigma_max: largest sigma.
            N: number of discretization steps.
        """
        super().__init__(N)
        self.theta = theta
        self.sigma_min = sigma_min
        self.sigma_max = sigma_max
        self.logsig = np.log(self.sigma_max / self.sigma_min)
        self.N = N
        self.sampler_type = sampler_type

    def copy(self):
        return OUVESDE(self.theta, self.sigma_min, self.sigma_max, N=self.N, sampler_type=self.sampler_type)

    @property
    def T(self):
        return 1

-    def sde(self, x, y, t):
+    def sde(self, x, t, y):  # argument order matches the base class's `self.sde(x, t, y)` call
        drift = self.theta * (y - x)
        # the sqrt(2*logsig) factor is required here so that logsig does not in the end affect the perturbation kernel
        # standard deviation. this can be understood from solving the integral of [exp(2s) * g(s)^2] from s=0 to t
        # with g(t) = sigma(t) as defined here, and seeing that `logsig` remains in the integral solution
        # unless this sqrt(2*logsig) factor is included.
        sigma = self.sigma_min * (self.sigma_max / self.sigma_min) ** t
        diffusion = sigma * np.sqrt(2 * self.logsig)
        return drift, diffusion

    def _mean(self, x0, y, t):
        theta = self.theta
        exp_interp = torch.exp(-theta * t)[:, None, None, None]
        return exp_interp * x0 + (1 - exp_interp) * y

    def alpha(self, t):
        return torch.exp(-self.theta * t)

    def _std(self, t):
        # This is a full solution to the ODE for P(t) in our derivations, after choosing g(s) as in self.sde()
        sigma_min, theta, logsig = self.sigma_min, self.theta, self.logsig
        # could maybe replace the two torch.exp(... * t) terms here by cached values **t
        return torch.sqrt(
            (
                sigma_min**2
                * torch.exp(-2 * theta * t)
                * (torch.exp(2 * (theta + logsig) * t) - 1)
                * logsig
            )
            /
            (theta + logsig)
        )

    def marginal_prob(self, x0, y, t):
        return self._mean(x0, y, t), self._std(t)

    def prior_sampling(self, shape, y):
        if shape != y.shape:
            warnings.warn(f"Target shape {shape} does not match shape of y {y.shape}! Ignoring target shape.")
        std = self._std(torch.ones((y.shape[0],), device=y.device))
        x_T = y + torch.randn_like(y) * std[:, None, None, None]
        return x_T

    def prior_logp(self, z):
        raise NotImplementedError("prior_logp for OU SDE not yet implemented!")


+@SDERegistry.register("ouvp")
+class OUVPSDE(SDE):
+    # !!! We do not utilize this SDE in our works due to observed instabilities around t=0.2. !!!
+    @staticmethod
+    def add_argparse_args(parser):
+        parser.add_argument("--sde-n", type=int, default=1000,
+                            help="The number of timesteps in the SDE discretization. 1000 by default.")
+        parser.add_argument("--beta-min", type=float, required=True,
+                            help="The minimum beta to use.")
+        parser.add_argument("--beta-max", type=float, required=True,
+                            help="The maximum beta to use.")
+        parser.add_argument("--stiffness", type=float, default=1,
+                            help="The stiffness factor for the drift, to be multiplied by 0.5*beta(t). 1 by default.")
+        return parser
+
+    def __init__(self, beta_min, beta_max, stiffness=1, N=1000, **ignored_kwargs):
+        """
+        !!! We do not utilize this SDE in our works due to observed instabilities around t=0.2. !!!
+
+        Construct an Ornstein-Uhlenbeck Variance Preserving SDE:
+
+        dx = 1/2 * beta(t) * stiffness * (y-x) dt + sqrt(beta(t)) * dw
+
+        with
+
+        beta(t) = beta_min + t(beta_max - beta_min)
+
+        Note that the "steady-state mean" `y` is not provided at construction, but must rather be given as an argument
+        to the methods which require it (e.g., `sde` or `marginal_prob`).
+
+        Args:
+            beta_min: smallest beta.
+            beta_max: largest beta.
+            stiffness: stiffness factor of the drift. 1 by default.
+            N: number of discretization steps.
+        """
+        super().__init__(N)
+        self.beta_min = beta_min
+        self.beta_max = beta_max
+        self.stiffness = stiffness
+        self.N = N
+
+    def copy(self):
+        return OUVPSDE(self.beta_min, self.beta_max, self.stiffness, N=self.N)
+
+    @property
+    def T(self):
+        return 1
+
+    def _beta(self, t):
+        return self.beta_min + t * (self.beta_max - self.beta_min)
+
+    def sde(self, x, t, y):
+        drift = 0.5 * self.stiffness * batch_broadcast(self._beta(t), y) * (y - x)
+        diffusion = torch.sqrt(self._beta(t))
+        return drift, diffusion
+
+    def _mean(self, x0, t, y):
+        b0, b1, s = self.beta_min, self.beta_max, self.stiffness
+        x0y_fac = torch.exp(-0.25 * s * t * (t * (b1-b0) + 2 * b0))[:, None, None, None]
+        return y + x0y_fac * (x0 - y)
+
+    def _std(self, t):
+        b0, b1, s = self.beta_min, self.beta_max, self.stiffness
+        return (1 - torch.exp(-0.5 * s * t * (t * (b1-b0) + 2 * b0))) / s
+
+    def marginal_prob(self, x0, t, y):
+        return self._mean(x0, t, y), self._std(t)
+
+    def prior_sampling(self, shape, y):
+        if shape != y.shape:
+            warnings.warn(f"Target shape {shape} does not match shape of y {y.shape}! Ignoring target shape.")
+        std = self._std(torch.ones((y.shape[0],), device=y.device))
+        x_T = y + torch.randn_like(y) * std[:, None, None, None]
+        return x_T
+
+    def prior_logp(self, z):
+        raise NotImplementedError("prior_logp for OU SDE not yet implemented!")
+
+
@SDERegistry.register("sbve")
class SBVESDE(SDE):
    @staticmethod
    def add_argparse_args(parser):
        parser.add_argument("--N", type=int, default=50, help="The number of timesteps in the SDE discretization. 50 by default.")
        parser.add_argument("--k", type=float, default=2.6, help="Parameter of the diffusion coefficient. 2.6 by default.")
        parser.add_argument("--c", type=float, default=0.4, help="Parameter of the diffusion coefficient. 0.4 by default.")
        parser.add_argument("--eps", type=float, default=1e-8, help="Small constant to avoid numerical instability. 1e-8 by default.")
        parser.add_argument("--sampler_type", type=str, default="ode")
        return parser

    def __init__(self, k, c, N=50, eps=1e-8, sampler_type="ode", **ignored_kwargs):
        """Construct a Schrödinger Bridge Variance Exploding SDE.

        As described in Jukić et al., "Schrödinger Bridge for Generative Speech Enhancement", 2024.

        Args:
            k: stiffness parameter.
            c: diffusion parameter.
            N: number of discretization steps.
        """
        super().__init__(N)
        self.k = k
        self.c = c
        self.N = N
        self.eps = eps
        self.sampler_type = sampler_type

    def copy(self):
        return SBVESDE(self.k, self.c, N=self.N, eps=self.eps, sampler_type=self.sampler_type)

    @property
    def T(self):
        return 1

-    def sde(self, x, y, t):
+    def sde(self, x, t, y):  # argument order matches the base class's `self.sde(x, t, y)` call
        f = 0.0  # Table 1
        g = torch.sqrt(torch.tensor(self.c)) * self.k**(t)  # Table 1
        return f, g

    def _sigmas_alphas(self, t):
        alpha_t = torch.ones_like(t)
        alpha_T = torch.ones_like(t)
        sigma_t = torch.sqrt((self.c*(self.k**(2*t)-1.0))
                             / (2*torch.log(torch.tensor(self.k))))  # Table 1
        sigma_T = torch.sqrt((self.c*(self.k**(2*self.T)-1.0))
                             / (2*torch.log(torch.tensor(self.k))))  # Table 1

        alpha_bart = alpha_t / (alpha_T + self.eps)  # below Eq. (9)
        sigma_bart = torch.sqrt(sigma_T**2 - sigma_t**2 + self.eps)  # below Eq. (9)

        return sigma_t, sigma_T, sigma_bart, alpha_t, alpha_T, alpha_bart

    def _mean(self, x0, y, t):
        sigma_t, sigma_T, sigma_bart, alpha_t, alpha_T, alpha_bart = self._sigmas_alphas(t)

        w_xt = alpha_t * sigma_bart**2 / (sigma_T**2 + self.eps)  # below Eq. (11)
        w_yt = alpha_bart * sigma_t**2 / (sigma_T**2 + self.eps)  # below Eq. (11)

        mu = w_xt[:, None, None, None] * x0 + w_yt[:, None, None, None] * y  # Eq. (11)
        return mu

    def _std(self, t):
        sigma_t, sigma_T, sigma_bart, alpha_t, alpha_T, alpha_bart = self._sigmas_alphas(t)

        sigma_xt = (alpha_t * sigma_bart * sigma_t) / (sigma_T + self.eps)
        return sigma_xt

    def marginal_prob(self, x0, y, t):
        return self._mean(x0, y, t), self._std(t)

    def prior_sampling(self, shape, y):
        if shape != y.shape:
            warnings.warn(f"Target shape {shape} does not match shape of y {y.shape}! Ignoring target shape.")
        x_T = y
        return x_T

    def prior_logp(self, z):
        raise NotImplementedError("prior_logp for SBVE SDE not yet implemented!")
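
As a quick check of the OUVE perturbation kernel: the closed-form `_std` above simplifies to std(t)^2 = sigma_min^2 * logsig * ((sigma_max/sigma_min)^(2t) - exp(-2*theta*t)) / (theta + logsig), so the variance grows from 0 at t=0 towards its maximum at t=T. A minimal sketch of `marginal_prob` and `prior_sampling`, assuming the repo is importable and with made-up tensor shapes:

import torch
from sgmse.sdes import SDERegistry

sde = SDERegistry.get_by_name("ouve")(theta=1.5, sigma_min=0.05, sigma_max=0.5, N=30)
x0 = torch.randn(2, 1, 64, 64)   # stands in for clean speech spectrograms
y = torch.randn(2, 1, 64, 64)    # stands in for noisy speech spectrograms

for t_val in (0.05, 0.5, 1.0):
    t = torch.full((2,), t_val)
    mean, std = sde.marginal_prob(x0, y, t)
    # the mean interpolates from x0 (t near 0) towards y, while std grows with t
    print(f"t={t_val}: dist to y = {(mean - y).norm():.1f}, std = {std[0]:.3f}")

x_T = sde.prior_sampling(y.shape, y)   # one sample from p_T(x|y)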
sgmse/util/inference.py CHANGED
@@ -1,64 +1,64 @@
import torch
from torchaudio import load

from pesq import pesq
from pystoi import stoi

from .other import si_sdr, pad_spec

# Settings
sr = 16000
snr = 0.5
N = 30
corrector_steps = 1


def evaluate_model(model, num_eval_files):
    clean_files = model.data_module.valid_set.clean_files
    noisy_files = model.data_module.valid_set.noisy_files

    # Select test files uniformly across validation files
    total_num_files = len(clean_files)
    indices = torch.linspace(0, total_num_files-1, num_eval_files, dtype=torch.int)
    clean_files = list(clean_files[i] for i in indices)
    noisy_files = list(noisy_files[i] for i in indices)

    _pesq = 0
    _si_sdr = 0
    _estoi = 0
    # Iterate over files
    for (clean_file, noisy_file) in zip(clean_files, noisy_files):
        # Load wavs
        x, _ = load(clean_file)
        y, _ = load(noisy_file)
        T_orig = x.size(1)

        # Normalize per utterance
        norm_factor = y.abs().max()
        y = y / norm_factor

        # Prepare DNN input
        Y = torch.unsqueeze(model._forward_transform(model._stft(y.cuda())), 0)
        Y = pad_spec(Y)
        y = y * norm_factor  # undo normalization before computing metrics

        # Reverse sampling
        sampler = model.get_pc_sampler(
            'reverse_diffusion', 'ald', Y.cuda(), N=N,
            corrector_steps=corrector_steps, snr=snr)
        sample, _ = sampler()

        x_hat = model.to_audio(sample.squeeze(), T_orig)
        x_hat = x_hat * norm_factor

        x_hat = x_hat.squeeze().cpu().numpy()
        x = x.squeeze().cpu().numpy()
        y = y.squeeze().cpu().numpy()

        _si_sdr += si_sdr(x, x_hat)
        _pesq += pesq(sr, x, x_hat, 'wb')
        _estoi += stoi(x, x_hat, sr, extended=True)

    return _pesq/num_eval_files, _si_sdr/num_eval_files, _estoi/num_eval_files
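
The uniform file selection above is easy to sanity-check in isolation; a small sketch with a hypothetical file list:

import torch

# Hypothetical stand-in for the validation file list
clean_files = [f"clean_{i:03d}.wav" for i in range(100)]

num_eval_files = 5
indices = torch.linspace(0, len(clean_files) - 1, num_eval_files, dtype=torch.int)
print(indices.tolist())                   # [0, 24, 49, 74, 99]
print([clean_files[i] for i in indices])  # evenly spaced across the whole set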
sgmse/util/other.py CHANGED
@@ -1,141 +1,141 @@
import os
import torch
import numpy as np
import scipy.stats
from scipy.signal import butter, sosfilt

from pesq import pesq
from pystoi import stoi


def si_sdr_components(s_hat, s, n):
    # s_target
    alpha_s = np.dot(s_hat, s) / np.linalg.norm(s)**2
    s_target = alpha_s * s

    # e_noise
    alpha_n = np.dot(s_hat, n) / np.linalg.norm(n)**2
    e_noise = alpha_n * n

    # e_art
    e_art = s_hat - s_target - e_noise

    return s_target, e_noise, e_art

def energy_ratios(s_hat, s, n):
    s_target, e_noise, e_art = si_sdr_components(s_hat, s, n)

    si_sdr = 10*np.log10(np.linalg.norm(s_target)**2 / np.linalg.norm(e_noise + e_art)**2)
    si_sir = 10*np.log10(np.linalg.norm(s_target)**2 / np.linalg.norm(e_noise)**2)
    si_sar = 10*np.log10(np.linalg.norm(s_target)**2 / np.linalg.norm(e_art)**2)

    return si_sdr, si_sir, si_sar

def mean_conf_int(data, confidence=0.95):
    a = 1.0 * np.array(data)
    n = len(a)
    m, se = np.mean(a), scipy.stats.sem(a)
    h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1)
    return m, h

class Method():
    def __init__(self, name, base_dir, metrics):
        self.name = name
        self.base_dir = base_dir
        self.metrics = {metric: [] for metric in metrics}

    def append(self, metric, value):
        self.metrics[metric].append(value)

    def get_mean_ci(self, metric):
        return mean_conf_int(np.array(self.metrics[metric]))

def hp_filter(signal, cut_off=80, order=10, sr=16000):
    factor = cut_off / sr * 2
    sos = butter(order, factor, 'hp', output='sos')
    filtered = sosfilt(sos, signal)
    return filtered

def si_sdr(s, s_hat):
    alpha = np.dot(s_hat, s) / np.linalg.norm(s)**2
    sdr = 10*np.log10(np.linalg.norm(alpha*s)**2 / np.linalg.norm(alpha*s - s_hat)**2)
    return sdr

def snr_dB(s, n):
    s_power = 1/len(s)*np.sum(s**2)
    n_power = 1/len(n)*np.sum(n**2)
    snr_dB = 10*np.log10(s_power/n_power)
    return snr_dB

def pad_spec(Y, mode="zero_pad"):
    # Pad the last (time) dimension up to the next multiple of 64
    T = Y.size(3)
    if T % 64 != 0:
        num_pad = 64 - T % 64
    else:
        num_pad = 0
    if mode == "zero_pad":
        pad2d = torch.nn.ZeroPad2d((0, num_pad, 0, 0))
    elif mode == "reflection":
        pad2d = torch.nn.ReflectionPad2d((0, num_pad, 0, 0))
    elif mode == "replication":
        pad2d = torch.nn.ReplicationPad2d((0, num_pad, 0, 0))
    else:
        raise NotImplementedError(f"Unknown padding mode: {mode}")
    return pad2d(Y)

def ensure_dir(file_path):
    directory = file_path
    if not os.path.exists(directory):
        os.makedirs(directory)


def print_metrics(x, y, x_hat_list, labels, sr=16000):
    _si_sdr_mix = si_sdr(x, y)
    _pesq_mix = pesq(sr, x, y, 'wb')
    _estoi_mix = stoi(x, y, sr, extended=True)
    print(f'Mixture: PESQ: {_pesq_mix:.2f}, ESTOI: {_estoi_mix:.2f}, SI-SDR: {_si_sdr_mix:.2f}')
    for i, x_hat in enumerate(x_hat_list):
        _si_sdr = si_sdr(x, x_hat)
        _pesq = pesq(sr, x, x_hat, 'wb')
        _estoi = stoi(x, x_hat, sr, extended=True)
        print(f'{labels[i]}: PESQ: {_pesq:.2f}, ESTOI: {_estoi:.2f}, SI-SDR: {_si_sdr:.2f}')

def mean_std(data):
    data = data[~np.isnan(data)]
    mean = np.mean(data)
    std = np.std(data)
    return mean, std

def print_mean_std(data, decimal=2):
    data = np.array(data)
    data = data[~np.isnan(data)]
    mean = np.mean(data)
    std = np.std(data)
    if decimal == 2:
        string = f'{mean:.2f} ± {std:.2f}'
    elif decimal == 1:
        string = f'{mean:.1f} ± {std:.1f}'
    else:
        raise ValueError("Only decimal=1 or decimal=2 are supported.")
    return string

def set_torch_cuda_arch_list():
    if not torch.cuda.is_available():
        print("CUDA is not available. No GPUs found.")
        return

    num_gpus = torch.cuda.device_count()
    compute_capabilities = []

    for i in range(num_gpus):
        cc_major, cc_minor = torch.cuda.get_device_capability(i)
        cc = f"{cc_major}.{cc_minor}"
        compute_capabilities.append(cc)

    cc_string = ";".join(compute_capabilities)
    os.environ['TORCH_CUDA_ARCH_LIST'] = cc_string
    print(f"Set TORCH_CUDA_ARCH_LIST to: {cc_string}")
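
A quick sanity check of `si_sdr` and `pad_spec` on synthetic data; the signal length and spectrogram shape are invented for illustration:

import numpy as np
import torch
from sgmse.util.other import si_sdr, pad_spec

# SI-SDR of an estimate with 10% additive noise should land near 20 dB
s = np.random.randn(16000)
s_hat = s + 0.1 * np.random.randn(16000)
print(f"SI-SDR: {si_sdr(s, s_hat):.2f} dB")

# pad_spec pads the last (time) dimension to the next multiple of 64
Y = torch.randn(1, 1, 256, 100)
print(pad_spec(Y).shape)   # torch.Size([1, 1, 256, 128])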
sgmse/util/registry.py CHANGED
@@ -1,34 +1,34 @@
import warnings
from typing import Callable


class Registry:
    def __init__(self, managed_thing: str):
        """
        Create a new registry.

        Args:
            managed_thing: A string describing what type of thing is managed by this registry. Will be used for
                warnings and errors, so it's a good idea to keep this string globally unique and easily understood.
        """
        self.managed_thing = managed_thing
        self._registry = {}

    def register(self, name: str) -> Callable:
        def inner_wrapper(wrapped_class) -> Callable:
            if name in self._registry:
                warnings.warn(f"{self.managed_thing} with name '{name}' doubly registered, old class will be replaced.")
            self._registry[name] = wrapped_class
            return wrapped_class
        return inner_wrapper

    def get_by_name(self, name: str):
        """Get a managed thing by name."""
        if name in self._registry:
            return self._registry[name]
        else:
            raise ValueError(f"{self.managed_thing} with name '{name}' unknown.")

    def get_all_names(self):
        """Get the list of things' names registered to this registry."""
        return list(self._registry.keys())
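
The registry is self-contained, so its contract is easy to demonstrate; a toy sketch mirroring how `SDERegistry` and `PredictorRegistry` are used above (the "Sampler" registry and `DummySampler` here are hypothetical):

from sgmse.util.registry import Registry

SamplerRegistry = Registry("Sampler")

@SamplerRegistry.register("dummy")
class DummySampler:
    def __call__(self):
        return "sampled!"

cls = SamplerRegistry.get_by_name("dummy")   # -> DummySampler
print(cls()())                               # "sampled!"
print(SamplerRegistry.get_all_names())       # ['dummy']
# SamplerRegistry.get_by_name("nope")        # would raise ValueError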
sgmse/util/tensors.py CHANGED
@@ -1,16 +1,16 @@
def batch_broadcast(a, x):
    """Broadcasts a over all dimensions of x, except the batch dimension, which must match."""

    if len(a.shape) != 1:
        a = a.squeeze()
        if len(a.shape) != 1:
            raise ValueError(
                f"Don't know how to batch-broadcast tensor `a` with more than one effective dimension (shape {a.shape})"
            )

    if a.shape[0] != x.shape[0] and a.shape[0] != 1:
        raise ValueError(
            f"Don't know how to batch-broadcast shape {a.shape} over {x.shape} as the batch dimension is not matching")

    out = a.view((x.shape[0], *(1 for _ in range(len(x.shape)-1))))
    return out
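
A short usage sketch for `batch_broadcast` (shapes invented): it reshapes a per-batch scalar so it broadcasts over the remaining dimensions, which is what the SDE and predictor classes do manually with `[:, None, None, None]`.

import torch
from sgmse.util.tensors import batch_broadcast

t = torch.rand(4)                 # one scalar per batch element, shape (4,)
x = torch.randn(4, 2, 64, 64)     # batch of spectrogram-like tensors

t_b = batch_broadcast(t, x)       # shape (4, 1, 1, 1), broadcastable against x
print(t_b.shape)
print((t_b * x).shape)            # torch.Size([4, 2, 64, 64])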