Spaces:
Runtime error
Runtime error
first version
Browse files- app.py +130 -0
- dnnlib/__init__.py +9 -0
- dnnlib/__pycache__/__init__.cpython-39.pyc +0 -0
- dnnlib/__pycache__/util.cpython-39.pyc +0 -0
- dnnlib/util.py +491 -0
- feature_networks/__pycache__/constants.cpython-39.pyc +0 -0
- feature_networks/__pycache__/pretrained_builder.cpython-39.pyc +0 -0
- feature_networks/__pycache__/vit.cpython-39.pyc +0 -0
- feature_networks/clip/__init__.py +1 -0
- feature_networks/clip/__pycache__/__init__.cpython-39.pyc +0 -0
- feature_networks/clip/__pycache__/clip.cpython-39.pyc +0 -0
- feature_networks/clip/__pycache__/model.cpython-39.pyc +0 -0
- feature_networks/clip/__pycache__/simple_tokenizer.cpython-39.pyc +0 -0
- feature_networks/clip/bpe_simple_vocab_16e6.txt.gz +3 -0
- feature_networks/clip/clip.py +244 -0
- feature_networks/clip/model.py +453 -0
- feature_networks/clip/simple_tokenizer.py +132 -0
- feature_networks/constants.py +129 -0
- feature_networks/pretrained_builder.py +417 -0
- feature_networks/vit.py +436 -0
- legacy.py +331 -0
- misc.py +275 -0
- pg_modules/__init__.py +0 -0
- pg_modules/__pycache__/MViT.cpython-39.pyc +0 -0
- pg_modules/__pycache__/__init__.cpython-39.pyc +0 -0
- pg_modules/__pycache__/blocks.cpython-38.pyc +0 -0
- pg_modules/__pycache__/blocks.cpython-39.pyc +0 -0
- pg_modules/__pycache__/diffaug.cpython-38.pyc +0 -0
- pg_modules/__pycache__/diffaug.cpython-39.pyc +0 -0
- pg_modules/__pycache__/discriminator.cpython-38.pyc +0 -0
- pg_modules/__pycache__/discriminator.cpython-39.pyc +0 -0
- pg_modules/__pycache__/mae.cpython-39.pyc +0 -0
- pg_modules/__pycache__/models_tnt.cpython-39.pyc +0 -0
- pg_modules/__pycache__/networks_fastgan.cpython-38.pyc +0 -0
- pg_modules/__pycache__/networks_fastgan.cpython-39.pyc +0 -0
- pg_modules/__pycache__/networks_stylegan2.cpython-39.pyc +0 -0
- pg_modules/__pycache__/projector.cpython-38.pyc +0 -0
- pg_modules/__pycache__/projector.cpython-39.pyc +0 -0
- pg_modules/__pycache__/simmim.cpython-39.pyc +0 -0
- pg_modules/__pycache__/vision_transformer.cpython-39.pyc +0 -0
- pg_modules/blocks.py +370 -0
- pg_modules/diffaug.py +76 -0
- pg_modules/discriminator.py +153 -0
- pg_modules/networks_fastgan.py +180 -0
- pg_modules/networks_stylegan2.py +537 -0
- pg_modules/projector.py +158 -0
app.py
ADDED
|
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import os
|
| 3 |
+
import gradio as gr
|
| 4 |
+
from PIL import Image
|
| 5 |
+
|
| 6 |
+
"""Generate images using pretrained network pickle."""
|
| 7 |
+
|
| 8 |
+
import re
|
| 9 |
+
from typing import List, Optional, Tuple, Union
|
| 10 |
+
|
| 11 |
+
import click
|
| 12 |
+
import dnnlib
|
| 13 |
+
import numpy as np
|
| 14 |
+
import PIL.Image
|
| 15 |
+
import torch
|
| 16 |
+
|
| 17 |
+
import legacy
|
| 18 |
+
|
| 19 |
+
from huggingface_hub import hf_hub_url
|
| 20 |
+
|
| 21 |
+
#----------------------------------------------------------------------------
|
| 22 |
+
|
| 23 |
+
def parse_range(s: Union[str, List]) -> List[int]:
    """Parse a comma-separated list of numbers or 'a-b' ranges into a list of ints.

    Example: '1,2,5-10' returns [1, 2, 5, 6, 7, 8, 9, 10].
    A list argument is passed through unchanged.
    """
    if isinstance(s, list):
        return s
    pattern = re.compile(r'^(\d+)-(\d+)$')
    values: List[int] = []
    for token in s.split(','):
        match = pattern.match(token)
        if match is None:
            values.append(int(token))
        else:
            lo, hi = int(match.group(1)), int(match.group(2))
            values.extend(range(lo, hi + 1))  # inclusive upper bound
    return values
|
| 37 |
+
|
| 38 |
+
#----------------------------------------------------------------------------
|
| 39 |
+
|
| 40 |
+
def parse_vec2(s: Union[str, Tuple[float, float]]) -> Tuple[float, float]:
    """Parse a 2-vector of syntax 'a,b' into a (float, float) tuple.

    Example: '0,1' returns (0.0, 1.0). A tuple argument is passed through
    unchanged.

    Raises:
        ValueError: if the string does not contain exactly two components.
    """
    if isinstance(s, tuple):
        return s
    pieces = s.split(',')
    if len(pieces) != 2:
        raise ValueError(f'cannot parse 2-vector {s}')
    return (float(pieces[0]), float(pieces[1]))
|
| 50 |
+
|
| 51 |
+
#----------------------------------------------------------------------------
|
| 52 |
+
|
| 53 |
+
def make_transform(translate: Tuple[float, float], angle: float):
    """Build a 3x3 homogeneous 2D transform: rotation by `angle` degrees
    followed by translation by `translate` (tx, ty).
    """
    theta = angle / 360.0 * np.pi * 2  # degrees -> radians
    c = np.cos(theta)
    s = np.sin(theta)
    m = np.eye(3)
    m[0][0] = c
    m[0][1] = s
    m[0][2] = translate[0]
    m[1][0] = -s
    m[1][1] = c
    m[1][2] = translate[1]
    return m
|
| 64 |
+
|
| 65 |
+
#----------------------------------------------------------------------------
|
| 66 |
+
|
| 67 |
+
# Select GPU when available, otherwise fall back to CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Directory holding the pretrained generator pickles.
# NOTE(review): hard-coded local Windows path carried over from the original
# code. For deployment, switch to hf_hub_url / open_url downloads instead.
_MODEL_DIR = r'E:\桌面\Preparation of Papers for IEEE Signal Processing Letters (5-page limit)\codes\projected-gan-clc - 副本'

# Load every available generator once at startup.
# `models` maps dataset name -> its G_ema (exponential-moving-average) network.
models = dict()
for name in ["pokemon", "art-paint", "flowers", "landscapes", "obama"]:
    # os.path.join replaces the original fragile string concatenation with a
    # trailing "\\" escape.
    with dnnlib.util.open_url(os.path.join(_MODEL_DIR, name + ".pkl")) as f:
        models[name] = legacy.load_network_pkl(f)['G_ema']
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
def generate_images(seeds, name):
    """Generate one image per seed with the pretrained generator `name`.

    Args:
        seeds: sequence of integer random seeds.
        name:  key into the module-level `models` dict.

    Returns:
        PIL.Image for the LAST seed in `seeds` (earlier images are generated
        but not kept — callers pass a single seed).
    """
    G = models[name].to(device)

    # Class label: these are unconditional models, so an all-zero vector.
    label = torch.zeros([1, G.c_dim], device=device)

    pilimg = None
    for seed_idx, seed in enumerate(seeds):
        # fix: 1-based progress (the original printed "0/1" for the first seed)
        print('Generating image for seed %d (%d/%d) ...' % (seed, seed_idx + 1, len(seeds)))
        z = torch.from_numpy(np.random.RandomState(seed).randn(1, G.z_dim)).to(device).float()

        # StyleGAN3-style generators expose an input transform; the generator
        # expects the INVERSE rotation/translation matrix to avoid potentially
        # failing numerical operations in the network.
        if hasattr(G.synthesis, 'input'):
            # fix: pass a (0, 0) tuple — the original passed the string '0,0',
            # whose second "component" is the comma character and would crash.
            m = np.linalg.inv(make_transform((0.0, 0.0), 0.0))
            G.synthesis.input.transform.copy_(torch.from_numpy(m))

        img = G(z, label, truncation_psi=1, noise_mode='const')
        # [-1, 1] float -> [0, 255] uint8, NCHW -> NHWC.
        img = (img.permute(0, 2, 3, 1) * 127.5 + 128).clamp(0, 255).to(torch.uint8)
        pilimg = PIL.Image.fromarray(img[0].cpu().numpy(), 'RGB')
    return pilimg
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
def inference(seedin, name=None):
    """Gradio callback: generate one image for slider value `seedin` using model `name`."""
    print(name)
    return generate_images([int(seedin)], name)
|
| 125 |
+
|
| 126 |
+
title = "Projected GAN CLC"
description = "Gradio demo for Projected GANs CLC, Pokemon."

# Build the demo UI: a seed slider plus a dataset selector, producing one image.
demo = gr.Interface(
    fn=inference,
    inputs=[
        gr.Slider(label="Seed", minimum=0, maximum=5000, step=1, value=0),
        gr.Radio(["pokemon", "art-paint", "flowers", "landscapes", "obama"], label='Dataset', value='art-paint'),
    ],
    outputs=["image"],
    title=title,
    description=description,
)
demo.launch()
|
dnnlib/__init__.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 4 |
+
# and proprietary rights in and to this software, related documentation
|
| 5 |
+
# and any modifications thereto. Any use, reproduction, disclosure or
|
| 6 |
+
# distribution of this software and related documentation without an express
|
| 7 |
+
# license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 8 |
+
|
| 9 |
+
from .util import EasyDict, make_cache_dir_path
|
dnnlib/__pycache__/__init__.cpython-39.pyc
ADDED
|
Binary file (285 Bytes). View file
|
|
|
dnnlib/__pycache__/util.cpython-39.pyc
ADDED
|
Binary file (14.1 kB). View file
|
|
|
dnnlib/util.py
ADDED
|
@@ -0,0 +1,491 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 4 |
+
# and proprietary rights in and to this software, related documentation
|
| 5 |
+
# and any modifications thereto. Any use, reproduction, disclosure or
|
| 6 |
+
# distribution of this software and related documentation without an express
|
| 7 |
+
# license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 8 |
+
|
| 9 |
+
"""Miscellaneous utility classes and functions."""
|
| 10 |
+
|
| 11 |
+
import ctypes
|
| 12 |
+
import fnmatch
|
| 13 |
+
import importlib
|
| 14 |
+
import inspect
|
| 15 |
+
import numpy as np
|
| 16 |
+
import os
|
| 17 |
+
import shutil
|
| 18 |
+
import sys
|
| 19 |
+
import types
|
| 20 |
+
import io
|
| 21 |
+
import pickle
|
| 22 |
+
import re
|
| 23 |
+
import requests
|
| 24 |
+
import html
|
| 25 |
+
import hashlib
|
| 26 |
+
import glob
|
| 27 |
+
import tempfile
|
| 28 |
+
import urllib
|
| 29 |
+
import urllib.request
|
| 30 |
+
import uuid
|
| 31 |
+
|
| 32 |
+
from distutils.util import strtobool
|
| 33 |
+
from typing import Any, List, Tuple, Union
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
# Util classes
|
| 37 |
+
# ------------------------------------------------------------------------------------------
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
class EasyDict(dict):
    """Dictionary subclass exposing its keys through attribute access (d.key == d['key'])."""

    def __getattr__(self, name: str) -> Any:
        # The attribute protocol requires AttributeError (not KeyError) for
        # missing names, so e.g. hasattr() and copy.copy() behave correctly.
        if name in self:
            return self[name]
        raise AttributeError(name)

    def __setattr__(self, name: str, value: Any) -> None:
        self[name] = value

    def __delattr__(self, name: str) -> None:
        del self[name]
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
class Logger(object):
    """Redirect stderr to stdout, optionally print stdout to a file, and optionally force flushing on both stdout and the file."""

    def __init__(self, file_name: str = None, file_mode: str = "w", should_flush: bool = True):
        # Optional mirror file; stays None when no file_name is given.
        self.file = None

        if file_name is not None:
            self.file = open(file_name, file_mode)

        self.should_flush = should_flush
        # Keep references to the real streams so close() can restore them.
        self.stdout = sys.stdout
        self.stderr = sys.stderr

        # From here on, ALL stdout/stderr traffic is routed through this
        # Logger instance (a module-level side effect of construction).
        sys.stdout = self
        sys.stderr = self

    def __enter__(self) -> "Logger":
        return self

    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
        # Context-manager exit restores the original streams via close().
        self.close()

    def write(self, text: Union[str, bytes]) -> None:
        """Write text to stdout (and a file) and optionally flush."""
        if isinstance(text, bytes):
            text = text.decode()
        if len(text) == 0: # workaround for a bug in VSCode debugger: sys.stdout.write(''); sys.stdout.flush() => crash
            return

        if self.file is not None:
            self.file.write(text)

        self.stdout.write(text)

        if self.should_flush:
            self.flush()

    def flush(self) -> None:
        """Flush written text to both stdout and a file, if open."""
        if self.file is not None:
            self.file.flush()

        self.stdout.flush()

    def close(self) -> None:
        """Flush, close possible files, and remove stdout/stderr mirroring."""
        self.flush()

        # if using multiple loggers, prevent closing in wrong order
        if sys.stdout is self:
            sys.stdout = self.stdout
        if sys.stderr is self:
            sys.stderr = self.stderr

        if self.file is not None:
            self.file.close()
            self.file = None
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
# Cache directories
|
| 116 |
+
# ------------------------------------------------------------------------------------------
|
| 117 |
+
|
| 118 |
+
# Module-level override for the cache root; None means "use env vars/defaults".
_dnnlib_cache_dir = None

def set_cache_dir(path: str) -> None:
    """Override the root directory used by make_cache_dir_path()."""
    global _dnnlib_cache_dir
    _dnnlib_cache_dir = path

def make_cache_dir_path(*paths: str) -> str:
    """Return a path under the dnnlib cache root, joined with *paths.

    Resolution order: explicit set_cache_dir() override, $DNNLIB_CACHE_DIR,
    $HOME/.cache/dnnlib, %USERPROFILE%/.cache/dnnlib, then the system temp dir.
    """
    if _dnnlib_cache_dir is not None:
        return os.path.join(_dnnlib_cache_dir, *paths)
    env = os.environ
    if 'DNNLIB_CACHE_DIR' in env:
        return os.path.join(env['DNNLIB_CACHE_DIR'], *paths)
    for var in ('HOME', 'USERPROFILE'):
        if var in env:
            return os.path.join(env[var], '.cache', 'dnnlib', *paths)
    return os.path.join(tempfile.gettempdir(), '.cache', 'dnnlib', *paths)
|
| 134 |
+
|
| 135 |
+
# Small util functions
|
| 136 |
+
# ------------------------------------------------------------------------------------------
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def format_time(seconds: Union[int, float]) -> str:
    """Render a duration as a human-readable string (s / m+s / h+m+s / d+h+m)."""
    total = int(np.rint(seconds))
    minutes, sec = divmod(total, 60)
    hours, minute = divmod(minutes, 60)
    days, hour = divmod(hours, 24)
    if days > 0:
        return f"{days}d {hour:02}h {minute:02}m"
    if hours > 0:
        return f"{hours}h {minute:02}m {sec:02}s"
    if minutes > 0:
        return f"{minutes}m {sec:02}s"
    return f"{total}s"
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
def format_time_brief(seconds: Union[int, float]) -> str:
    """Render a duration compactly with at most two units (s / m+s / h+m / d+h)."""
    total = int(np.rint(seconds))
    if total >= 24 * 60 * 60:
        return f"{total // (24 * 60 * 60)}d {(total // (60 * 60)) % 24:02}h"
    if total >= 60 * 60:
        return f"{total // (60 * 60)}h {(total // 60) % 60:02}m"
    if total >= 60:
        return f"{total // 60}m {total % 60:02}s"
    return f"{total}s"
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
def ask_yes_no(question: str) -> bool:
    """Ask `question` on stdout and loop until the user types a recognizable yes/no.

    Accepts the same spellings as distutils.util.strtobool, which this replaces
    because distutils was deprecated and removed in Python 3.12:
    y/yes/t/true/on/1 -> True; n/no/f/false/off/0 -> False.
    """
    truthy = {"y", "yes", "t", "true", "on", "1"}
    falsy = {"n", "no", "f", "false", "off", "0"}
    while True:
        print("{0} [y/n]".format(question))
        answer = input().lower()
        if answer in truthy:
            return True
        if answer in falsy:
            return False
        # Unrecognized input: re-ask (mirrors the original ValueError retry loop).
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
def tuple_product(t: Tuple) -> Any:
    """Calculate the product of the tuple elements (1 for an empty tuple)."""
    product = 1
    for value in t:
        product = product * value
    return product
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
# Mapping from numpy-style dtype names to ctypes types of identical byte size.
_str_to_ctype = {
    "uint8": ctypes.c_ubyte,
    "uint16": ctypes.c_uint16,
    "uint32": ctypes.c_uint32,
    "uint64": ctypes.c_uint64,
    "int8": ctypes.c_byte,
    "int16": ctypes.c_int16,
    "int32": ctypes.c_int32,
    "int64": ctypes.c_int64,
    "float32": ctypes.c_float,
    "float64": ctypes.c_double,
}


def get_dtype_and_ctype(type_obj: Any) -> Tuple[np.dtype, Any]:
    """Resolve `type_obj` (a name string, or anything with a __name__ or name
    attribute, e.g. a numpy dtype) to a (numpy dtype, ctypes type) pair that
    have the same size in bytes."""
    if isinstance(type_obj, str):
        type_str = type_obj
    elif hasattr(type_obj, "__name__"):
        type_str = type_obj.__name__
    elif hasattr(type_obj, "name"):
        type_str = type_obj.name
    else:
        raise RuntimeError("Cannot infer type name from input")

    assert type_str in _str_to_ctype.keys()
    dtype = np.dtype(type_str)
    ctype = _str_to_ctype[type_str]
    # Sanity check: both representations must occupy the same number of bytes.
    assert dtype.itemsize == ctypes.sizeof(ctype)
    return dtype, ctype
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
def is_pickleable(obj: Any) -> bool:
    """Return True if `obj` survives pickle.dump without raising."""
    try:
        with io.BytesIO() as stream:
            pickle.dump(obj, stream)
        return True
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # still propagate instead of being reported as "not pickleable".
        return False
|
| 231 |
+
|
| 232 |
+
|
| 233 |
+
# Functionality to import modules/objects by name, and call functions by name
|
| 234 |
+
# ------------------------------------------------------------------------------------------
|
| 235 |
+
|
| 236 |
+
def get_module_from_obj_name(obj_name: str) -> Tuple[types.ModuleType, str]:
    """Searches for the underlying module behind the name to some python object.
    Returns the module and the object name (original name with module part removed)."""

    # Allow convenience shorthands, substitute them by full names. The dots
    # are escaped: the original patterns '^np.' / '^tf.' matched ANY third
    # character, so e.g. 'npx...' would have been rewritten incorrectly.
    obj_name = re.sub(r"^np\.", "numpy.", obj_name)
    obj_name = re.sub(r"^tf\.", "tensorflow.", obj_name)

    # List alternatives for (module_name, local_obj_name), longest module first.
    parts = obj_name.split(".")
    name_pairs = [(".".join(parts[:i]), ".".join(parts[i:])) for i in range(len(parts), 0, -1)]

    # Try each alternative in turn.
    for module_name, local_obj_name in name_pairs:
        try:
            module = importlib.import_module(module_name) # may raise ImportError
            get_obj_from_module(module, local_obj_name) # may raise AttributeError
            return module, local_obj_name
        except Exception:
            # Narrowed from a bare `except:` so KeyboardInterrupt propagates.
            pass

    # Maybe some of the modules themselves contain errors?
    for module_name, _local_obj_name in name_pairs:
        try:
            importlib.import_module(module_name) # may raise ImportError
        except ImportError:
            # Re-raise genuine import errors inside an existing module.
            if not str(sys.exc_info()[1]).startswith("No module named '" + module_name + "'"):
                raise

    # Maybe the requested attribute is missing?
    for module_name, local_obj_name in name_pairs:
        try:
            module = importlib.import_module(module_name) # may raise ImportError
            get_obj_from_module(module, local_obj_name) # may raise AttributeError
        except ImportError:
            pass

    # We are out of luck, but we have no idea why.
    raise ImportError(obj_name)
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
def get_obj_from_module(module: types.ModuleType, obj_name: str) -> Any:
    """Traverse the dotted `obj_name` inside `module` and return the final
    (rightmost) attribute. An empty name returns the module itself."""
    target = module
    if obj_name:
        for attr in obj_name.split("."):
            target = getattr(target, attr)
    return target
|
| 285 |
+
|
| 286 |
+
|
| 287 |
+
def get_obj_by_name(name: str) -> Any:
    """Finds the python object with the given name.

    `name` is a fully-qualified dotted path, e.g. 'numpy.ndarray'; resolution
    is delegated to get_module_from_obj_name / get_obj_from_module.
    """
    module, obj_name = get_module_from_obj_name(name)
    return get_obj_from_module(module, obj_name)
|
| 291 |
+
|
| 292 |
+
|
| 293 |
+
def call_func_by_name(*args, func_name: str = None, **kwargs) -> Any:
    """Resolve `func_name` (keyword-only, required) to a callable and invoke it
    with the given positional and keyword arguments."""
    assert func_name is not None
    fn = get_obj_by_name(func_name)
    assert callable(fn)
    return fn(*args, **kwargs)
|
| 299 |
+
|
| 300 |
+
|
| 301 |
+
def construct_class_by_name(*args, class_name: str = None, **kwargs) -> Any:
    """Finds the python class with the given name and constructs it with the given arguments.

    `class_name` is keyword-only; all other arguments are forwarded to the
    class constructor.
    """
    return call_func_by_name(*args, func_name=class_name, **kwargs)
|
| 304 |
+
|
| 305 |
+
|
| 306 |
+
def get_module_dir_by_obj_name(obj_name: str) -> str:
    """Get the directory path of the module containing the given object name.

    Resolves `obj_name` via get_module_from_obj_name, then returns the
    directory of that module's source file.
    """
    module, _ = get_module_from_obj_name(obj_name)
    return os.path.dirname(inspect.getfile(module))
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
def is_top_level_function(obj: Any) -> bool:
    """Determine whether the given object is a top-level function, i.e., defined
    at module scope using 'def' (its name appears in its module's namespace)."""
    if not callable(obj):
        return False
    return obj.__name__ in sys.modules[obj.__module__].__dict__
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
def get_top_level_function_name(obj: Any) -> str:
    """Return the fully-qualified name of a top-level function."""
    assert is_top_level_function(obj)
    module = obj.__module__
    # Functions defined in __main__ are qualified by the script's file stem
    # instead of the literal '__main__'.
    if module == '__main__':
        module = os.path.splitext(os.path.basename(sys.modules[module].__file__))[0]
    return module + "." + obj.__name__
|
| 324 |
+
|
| 325 |
+
|
| 326 |
+
# File system helpers
|
| 327 |
+
# ------------------------------------------------------------------------------------------
|
| 328 |
+
|
| 329 |
+
def list_dir_recursively_with_ignore(dir_path: str, ignores: List[str] = None, add_base_to_relative: bool = False) -> List[Tuple[str, str]]:
    """List all files recursively in a given directory while ignoring given file and directory names.

    Returns list of tuples containing both absolute and relative paths.
    `ignores` holds fnmatch-style patterns matched against both directory and
    file names; when `add_base_to_relative` is set, relative paths are prefixed
    with the basename of `dir_path`.
    """
    assert os.path.isdir(dir_path)
    base_name = os.path.basename(os.path.normpath(dir_path))

    if ignores is None:
        ignores = []

    result = []

    for root, dirs, files in os.walk(dir_path, topdown=True):
        for ignore_ in ignores:
            dirs_to_remove = [d for d in dirs if fnmatch.fnmatch(d, ignore_)]

            # dirs need to be edited in-place: os.walk (topdown) only prunes
            # subtrees when the SAME list object is mutated.
            for d in dirs_to_remove:
                dirs.remove(d)

            # Each pattern filters the remaining files cumulatively.
            files = [f for f in files if not fnmatch.fnmatch(f, ignore_)]

        absolute_paths = [os.path.join(root, f) for f in files]
        relative_paths = [os.path.relpath(p, dir_path) for p in absolute_paths]

        if add_base_to_relative:
            relative_paths = [os.path.join(base_name, p) for p in relative_paths]

        assert len(absolute_paths) == len(relative_paths)
        result += zip(absolute_paths, relative_paths)

    return result
|
| 360 |
+
|
| 361 |
+
|
| 362 |
+
def copy_files_and_create_dirs(files: List[Tuple[str, str]]) -> None:
    """Takes in a list of tuples of (src, dst) paths and copies files.

    Will create all necessary destination directories.
    """
    for src, dst in files:
        target_dir = os.path.dirname(dst)
        # exist_ok avoids the check-then-create race of the original
        # `if not os.path.exists(...)` guard; the truthiness check also fixes
        # a crash when dst has no directory part (makedirs('') raises).
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        shutil.copyfile(src, dst)
|
| 373 |
+
|
| 374 |
+
|
| 375 |
+
# URL helpers
|
| 376 |
+
# ------------------------------------------------------------------------------------------
|
| 377 |
+
|
| 378 |
+
def is_url(obj: Any, allow_file_urls: bool = False) -> bool:
    """Determine whether the given object is a valid URL string.

    A string qualifies when both the URL itself and its site root parse with a
    scheme and a dotted network location. file:// URLs qualify only when
    `allow_file_urls` is set.
    """
    if not isinstance(obj, str) or "://" not in obj:
        return False
    if allow_file_urls and obj.startswith('file://'):
        return True
    try:
        res = requests.compat.urlparse(obj)
        if not res.scheme or not res.netloc or "." not in res.netloc:
            return False
        # Also require the site root to parse, catching malformed bases.
        res = requests.compat.urlparse(requests.compat.urljoin(obj, "/"))
        if not res.scheme or not res.netloc or "." not in res.netloc:
            return False
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit propagate.
        return False
    return True
|
| 394 |
+
|
| 395 |
+
|
| 396 |
+
def open_url(url: str, cache_dir: str = None, num_attempts: int = 10, verbose: bool = True, return_filename: bool = False, cache: bool = True) -> Any:
    """Download the given URL and return a binary-mode file object to access the data.

    Local paths and file:// URLs are opened directly. Remote downloads are
    retried up to `num_attempts` times and, when `cache` is True, stored under
    `cache_dir` (default: the dnnlib cache) keyed by the URL's MD5 hash. When
    `return_filename` is True, the on-disk filename is returned instead of a
    file object (requires `cache` for remote URLs).
    """
    assert num_attempts >= 1
    assert not (return_filename and (not cache))

    # Doesn't look like an URL scheme so interpret it as a local filename.
    if not re.match('^[a-z]+://', url):
        return url if return_filename else open(url, "rb")

    # Handle file URLs. This code handles unusual file:// patterns that
    # arise on Windows:
    #
    # file:///c:/foo.txt
    #
    # which would translate to a local '/c:/foo.txt' filename that's
    # invalid. Drop the forward slash for such pathnames.
    #
    # If you touch this code path, you should test it on both Linux and
    # Windows.
    #
    # Some internet resources suggest using urllib.request.url2pathname() but
    # but that converts forward slashes to backslashes and this causes
    # its own set of problems.
    if url.startswith('file://'):
        filename = urllib.parse.urlparse(url).path
        if re.match(r'^/[a-zA-Z]:', filename):
            filename = filename[1:]
        return filename if return_filename else open(filename, "rb")

    assert is_url(url)

    # Lookup from cache.
    if cache_dir is None:
        cache_dir = make_cache_dir_path('downloads')

    url_md5 = hashlib.md5(url.encode("utf-8")).hexdigest()
    if cache:
        # Cache entries are named "<md5>_<sanitized original name>".
        cache_files = glob.glob(os.path.join(cache_dir, url_md5 + "_*"))
        if len(cache_files) == 1:
            filename = cache_files[0]
            return filename if return_filename else open(filename, "rb")

    # Download.
    url_name = None
    url_data = None
    with requests.Session() as session:
        if verbose:
            print("Downloading %s ..." % url, end="", flush=True)
        for attempts_left in reversed(range(num_attempts)):
            try:
                with session.get(url) as res:
                    res.raise_for_status()
                    if len(res.content) == 0:
                        raise IOError("No data received")

                    # Small responses may be Google Drive interstitial pages
                    # rather than the payload; detect and retry accordingly.
                    if len(res.content) < 8192:
                        content_str = res.content.decode("utf-8")
                        if "download_warning" in res.headers.get("Set-Cookie", ""):
                            links = [html.unescape(link) for link in content_str.split('"') if "export=download" in link]
                            if len(links) == 1:
                                # Follow the confirmation link on the next attempt.
                                url = requests.compat.urljoin(url, links[0])
                                raise IOError("Google Drive virus checker nag")
                        if "Google Drive - Quota exceeded" in content_str:
                            raise IOError("Google Drive download quota exceeded -- please try again later")

                    match = re.search(r'filename="([^"]*)"', res.headers.get("Content-Disposition", ""))
                    url_name = match[1] if match else url
                    url_data = res.content
                    if verbose:
                        print(" done")
                    break
            except KeyboardInterrupt:
                raise
            except:
                if not attempts_left:
                    if verbose:
                        print(" failed")
                    raise
                if verbose:
                    print(".", end="", flush=True)

    # Save to cache.
    if cache:
        safe_name = re.sub(r"[^0-9a-zA-Z-._]", "_", url_name)
        cache_file = os.path.join(cache_dir, url_md5 + "_" + safe_name)
        temp_file = os.path.join(cache_dir, "tmp_" + uuid.uuid4().hex + "_" + url_md5 + "_" + safe_name)
        os.makedirs(cache_dir, exist_ok=True)
        with open(temp_file, "wb") as f:
            f.write(url_data)
        os.replace(temp_file, cache_file) # atomic
        if return_filename:
            return cache_file

    # Return data as file object.
    assert not return_filename
    return io.BytesIO(url_data)
|
feature_networks/__pycache__/constants.cpython-39.pyc
ADDED
|
Binary file (2.06 kB). View file
|
|
|
feature_networks/__pycache__/pretrained_builder.cpython-39.pyc
ADDED
|
Binary file (8.5 kB). View file
|
|
|
feature_networks/__pycache__/vit.cpython-39.pyc
ADDED
|
Binary file (8.58 kB). View file
|
|
|
feature_networks/clip/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
from .clip import *
|
feature_networks/clip/__pycache__/__init__.cpython-39.pyc
ADDED
|
Binary file (254 Bytes). View file
|
|
|
feature_networks/clip/__pycache__/clip.cpython-39.pyc
ADDED
|
Binary file (9.19 kB). View file
|
|
|
feature_networks/clip/__pycache__/model.cpython-39.pyc
ADDED
|
Binary file (15.4 kB). View file
|
|
|
feature_networks/clip/__pycache__/simple_tokenizer.cpython-39.pyc
ADDED
|
Binary file (5.84 kB). View file
|
|
|
feature_networks/clip/bpe_simple_vocab_16e6.txt.gz
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:924691ac288e54409236115652ad4aa250f48203de50a9e4722a6ecd48d6804a
|
| 3 |
+
size 1356917
|
feature_networks/clip/clip.py
ADDED
|
@@ -0,0 +1,244 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import hashlib
|
| 2 |
+
import os
|
| 3 |
+
import urllib
|
| 4 |
+
import warnings
|
| 5 |
+
from typing import Union, List
|
| 6 |
+
|
| 7 |
+
import torch
|
| 8 |
+
import torch.nn as nn
|
| 9 |
+
from PIL import Image
|
| 10 |
+
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
|
| 11 |
+
from tqdm import tqdm
|
| 12 |
+
|
| 13 |
+
from .model import build_model
|
| 14 |
+
from .simple_tokenizer import SimpleTokenizer as _Tokenizer
|
| 15 |
+
|
| 16 |
+
__all__ = ["available_models", "load", "tokenize"]
|
| 17 |
+
_tokenizer = _Tokenizer()
|
| 18 |
+
|
| 19 |
+
_MODELS = {
|
| 20 |
+
"RN50": "https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt",
|
| 21 |
+
"RN101": "https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt",
|
| 22 |
+
"RN50x4": "https://openaipublic.azureedge.net/clip/models/7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd/RN50x4.pt",
|
| 23 |
+
"ViT-B/32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt",
|
| 24 |
+
}
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def _download(url: str, root: str = os.path.expanduser("~/.cache/clip")):
    """Download `url` into `root` and return the local file path.

    The expected SHA256 checksum is taken from the second-to-last URL path
    component (the layout used by OpenAI's model URLs). An existing file with
    a matching checksum is reused; a mismatching file is re-downloaded.

    Raises
    ------
    RuntimeError
        If the target path exists but is not a regular file, or if the
        downloaded data fails the checksum.
    """
    os.makedirs(root, exist_ok=True)
    filename = os.path.basename(url)

    expected_sha256 = url.split("/")[-2]
    download_target = os.path.join(root, filename)

    if os.path.exists(download_target) and not os.path.isfile(download_target):
        raise RuntimeError(f"{download_target} exists and is not a regular file")

    def _file_sha256(path):
        # Hash in chunks so multi-hundred-MB checkpoints are never fully
        # resident in memory, and close the handle deterministically.
        digest = hashlib.sha256()
        with open(path, "rb") as f:
            for chunk in iter(lambda: f.read(1 << 20), b""):
                digest.update(chunk)
        return digest.hexdigest()

    if os.path.isfile(download_target):
        if _file_sha256(download_target) == expected_sha256:
            return download_target
        warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file")

    with urllib.request.urlopen(url) as source, open(download_target, "wb") as output:
        with tqdm(total=int(source.info().get("Content-Length")), ncols=80, unit='iB', unit_scale=True) as loop:
            while True:
                buffer = source.read(8192)
                if not buffer:
                    break

                output.write(buffer)
                loop.update(len(buffer))

    if _file_sha256(download_target) != expected_sha256:
        # Original message contained a doubled "not"; checksum failure is fatal.
        raise RuntimeError("Model has been downloaded but the SHA256 checksum does not match")

    return download_target
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def _transform(n_px):
    """Build CLIP's image preprocessing pipeline for an `n_px` input:
    resize, center-crop, force RGB, tensorize, and normalize with the
    CLIP training-set statistics."""
    def _to_rgb(image):
        return image.convert("RGB")

    steps = [
        Resize(n_px, interpolation=Image.BICUBIC),
        CenterCrop(n_px),
        _to_rgb,
        ToTensor(),
        Normalize((0.48145466, 0.4578275, 0.40821073),
                  (0.26862954, 0.26130258, 0.27577711)),
    ]
    return Compose(steps)
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
def available_models() -> List[str]:
    """Return the names of the available CLIP models."""
    # Iterating a dict yields its keys in insertion order, same as .keys().
    return [name for name in _MODELS]
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def load(name: str, device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu", jit=True):
    """Load a CLIP model

    Parameters
    ----------
    name : str
        A model name listed by `clip.available_models()`, or the path to a model checkpoint containing the state_dict

    device : Union[str, torch.device]
        The device to put the loaded model

    jit : bool
        Whether to load the optimized JIT model (default) or more hackable non-JIT model.

    Returns
    -------
    model : torch.nn.Module
        The CLIP model

    preprocess : Callable[[PIL.Image], torch.Tensor]
        A torchvision transform that converts a PIL image into a tensor that the returned model can take as its input
    """
    if name in _MODELS:
        model_path = _download(_MODELS[name])
    elif os.path.isfile(name):
        model_path = name
    else:
        raise RuntimeError(f"Model {name} not found; available models = {available_models()}")

    try:
        # loading JIT archive
        model = torch.jit.load(model_path, map_location=device if jit else "cpu").eval()
        state_dict = None
    except RuntimeError:
        # loading saved state dict (the file was not a TorchScript archive)
        if jit:
            warnings.warn(f"File {model_path} is not a JIT archive. Loading as a state dict instead")
            jit = False
        state_dict = torch.load(model_path, map_location="cpu")

    if not jit:
        # Rebuild the eager-mode model either from the loaded state dict or by
        # extracting the JIT archive's weights.
        model = build_model(state_dict or model.state_dict()).to(device)
        if str(device) == "cpu":
            # fp16 ops are not supported for CPU inference; fall back to fp32.
            model.float()
        return model, _transform(model.visual.input_resolution)

    # patch the device names
    # The JIT archive has device strings baked into its graph constants; trace a
    # trivial op on the target device to obtain a constant node to copy from.
    device_holder = torch.jit.trace(lambda: torch.ones([]).to(torch.device(device)), example_inputs=[])
    device_node = [n for n in device_holder.graph.findAllNodes("prim::Constant") if "Device" in repr(n)][-1]

    def patch_device(module):
        # Rewrite every hard-coded "cuda*" device constant in the module graph
        # (and its `forward1` overload graph, if present) to the target device.
        graphs = [module.graph] if hasattr(module, "graph") else []
        if hasattr(module, "forward1"):
            graphs.append(module.forward1.graph)

        for graph in graphs:
            for node in graph.findAllNodes("prim::Constant"):
                if "value" in node.attributeNames() and str(node["value"]).startswith("cuda"):
                    node.copyAttributes(device_node)

    model.apply(patch_device)
    patch_device(model.encode_image)
    patch_device(model.encode_text)

    # patch dtype to float32 on CPU
    if str(device) == "cpu":
        float_holder = torch.jit.trace(lambda: torch.ones([]).float(), example_inputs=[])
        float_input = list(float_holder.graph.findNode("aten::to").inputs())[1]
        float_node = float_input.node()

        def patch_float(module):
            # Rewrite aten::to casts targeting dtype code 5 (presumably the
            # ScalarType enum value for half precision -- TODO confirm) to fp32.
            graphs = [module.graph] if hasattr(module, "graph") else []
            if hasattr(module, "forward1"):
                graphs.append(module.forward1.graph)

            for graph in graphs:
                for node in graph.findAllNodes("aten::to"):
                    inputs = list(node.inputs())
                    for i in [1, 2]:  # dtype can be the second or third argument to aten::to()
                        if inputs[i].node()["value"] == 5:
                            inputs[i].node().copyAttributes(float_node)

        model.apply(patch_float)
        patch_float(model.encode_image)
        patch_float(model.encode_text)

        model.float()

    return model, _transform(model.input_resolution.item())
|
| 163 |
+
|
| 164 |
+
|
| 165 |
+
def tokenize(texts: Union[str, List[str]], context_length: int = 77, truncate: bool = False) -> torch.LongTensor:
    """
    Returns the tokenized representation of given input string(s)

    Parameters
    ----------
    texts : Union[str, List[str]]
        An input string or a list of input strings to tokenize

    context_length : int
        The context length to use; all CLIP models use 77 as the context length

    truncate : bool
        Whether to truncate over-long inputs to `context_length` (keeping the
        end-of-text token) instead of raising. Defaults to False, which
        preserves the original raising behavior.

    Returns
    -------
    A two-dimensional tensor containing the resulting tokens, shape = [number of input strings, context_length]
    """
    if isinstance(texts, str):
        texts = [texts]

    sot_token = _tokenizer.encoder["<|startoftext|>"]
    eot_token = _tokenizer.encoder["<|endoftext|>"]
    all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] for text in texts]
    result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)

    for i, tokens in enumerate(all_tokens):
        if len(tokens) > context_length:
            if truncate:
                # Drop the overflow but keep the EOT marker in the last slot,
                # mirroring upstream CLIP's truncation behavior.
                tokens = tokens[:context_length]
                tokens[-1] = eot_token
            else:
                raise RuntimeError(f"Input {texts[i]} is too long for context length {context_length}")
        result[i, :len(tokens)] = torch.tensor(tokens)

    return result
|
| 195 |
+
|
| 196 |
+
def pdist(sample_1, sample_2, norm=2, eps=1e-5):
    r"""All pairwise l_p distances between two samples.

    Arguments
    ---------
    sample_1 : torch.Tensor or Variable
        First sample, shape ``(n_1, d)``.
    sample_2 : torch.Tensor or Variable
        Second sample, shape ``(n_2, d)``.
    norm : float
        The l_p norm to use.
    Returns
    -------
    torch.Tensor or Variable
        Matrix of shape ``(n_1, n_2)`` whose ``[i, j]`` entry is
        ``|| sample_1[i, :] - sample_2[j, :] ||_p`` (up to the ``eps``
        regularizer added for numerical stability)."""
    rows, cols = sample_1.size(0), sample_2.size(0)
    norm = float(norm)

    if norm == 2.:
        # Expand ||a - b||^2 = ||a||^2 + ||b||^2 - 2<a, b> to avoid forming
        # the (n_1, n_2, d) difference tensor.
        sq_1 = torch.sum(sample_1 ** 2, dim=1, keepdim=True)
        sq_2 = torch.sum(sample_2 ** 2, dim=1, keepdim=True)
        cross = sample_1.mm(sample_2.t())
        squared = sq_1.expand(rows, cols) + sq_2.transpose(0, 1).expand(rows, cols) - 2 * cross
        return torch.sqrt(eps + torch.abs(squared))

    # General l_p norm via broadcasting over the feature dimension.
    dim = sample_1.size(1)
    lhs = sample_1.unsqueeze(1).expand(rows, cols, dim)
    rhs = sample_2.unsqueeze(0).expand(rows, cols, dim)
    powered = torch.abs(lhs - rhs) ** norm
    return (eps + torch.sum(powered, dim=2, keepdim=False)) ** (1. / norm)
|
| 227 |
+
|
| 228 |
+
|
| 229 |
+
class ClipHead(nn.Module):
    """CLIP-based text-guidance head.

    Loads the RN50 CLIP model (non-JIT) in eval mode and scores convolutional
    feature maps against a fixed text prompt.
    """

    def __init__(self, prompt, device='cpu'):
        super().__init__()
        self.clip_model = load("RN50", device=device, jit=False)[0].eval()
        self.prompt = prompt

    def calc_loss(self, features):
        """Return the mean negative cosine similarity between the prompt's
        text embedding and the CLIP-pooled image features.

        `features` is a dict whose 'last' entry holds a conv feature map
        (NCHW) -- assumed from `encode_conv_features`'s pooling; TODO confirm
        against the caller.
        """
        # `.device` works for both CPU and CUDA tensors; the original
        # `.get_device()` returns -1 for CPU tensors, which breaks `.to(dev)`.
        dev = features['last'].device
        text_input = tokenize(self.prompt).to(dev)

        text_features = self.clip_model.encode_text(text_input)
        image_features = self.clip_model.encode_conv_features(features['last'])
        loss = -torch.cosine_similarity(text_features, image_features, dim=1)

        return loss.mean()
|
feature_networks/clip/model.py
ADDED
|
@@ -0,0 +1,453 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from collections import OrderedDict
|
| 2 |
+
from typing import Tuple, Union
|
| 3 |
+
|
| 4 |
+
import numpy as np
|
| 5 |
+
import torch
|
| 6 |
+
import torch.nn.functional as F
|
| 7 |
+
from torch import nn
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class Bottleneck(nn.Module):
    """CLIP's anti-aliased ResNet bottleneck block.

    Every convolution runs with stride 1; when ``stride > 1`` the
    downsampling is done by an AvgPool2d after the second conv (and on the
    shortcut branch), which reduces aliasing compared to strided convs.
    """

    expansion = 4

    def __init__(self, inplanes, planes, stride=1):
        super().__init__()

        out_planes = planes * self.expansion

        self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)

        self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        # Pooling (not a strided conv) performs the spatial reduction.
        self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()

        self.conv3 = nn.Conv2d(planes, out_planes, 1, bias=False)
        self.bn3 = nn.BatchNorm2d(out_planes)

        self.relu = nn.ReLU(inplace=True)
        self.downsample = None
        self.stride = stride

        if stride > 1 or inplanes != out_planes:
            # Shortcut projection: avgpool first, then a stride-1 1x1 conv.
            self.downsample = nn.Sequential(OrderedDict([
                ("-1", nn.AvgPool2d(stride)),
                ("0", nn.Conv2d(inplanes, out_planes, 1, stride=1, bias=False)),
                ("1", nn.BatchNorm2d(out_planes)),
            ]))

    def forward(self, x: torch.Tensor):
        shortcut = x if self.downsample is None else self.downsample(x)

        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(self.avgpool(out)))

        return self.relu(out + shortcut)
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
class AttentionPool2d(nn.Module):
    """QKV attention pooling over a 2D feature map (used by ModifiedResNet).

    The flattened spatial positions are prepended with their mean as an extra
    token; one multi-head attention layer runs over the sequence and the
    output at that mean token is returned as the pooled embedding.
    """

    def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
        super().__init__()
        # spacial_dim ** 2 spatial positions plus 1 for the prepended mean token.
        self.positional_embedding = nn.Parameter(torch.randn(spacial_dim ** 2 + 1, embed_dim) / embed_dim ** 0.5)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
        self.num_heads = num_heads

    def forward(self, x):
        x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3]).permute(2, 0, 1)  # NCHW -> (HW)NC
        x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0)  # (HW+1)NC
        x = x + self.positional_embedding[:, None, :].to(x.dtype)  # (HW+1)NC
        # Functional MHA with separate q/k/v projection weights; q == k == v
        # so every token (including the mean token) attends over all tokens.
        x, _ = F.multi_head_attention_forward(
            query=x, key=x, value=x,
            embed_dim_to_check=x.shape[-1],
            num_heads=self.num_heads,
            q_proj_weight=self.q_proj.weight,
            k_proj_weight=self.k_proj.weight,
            v_proj_weight=self.v_proj.weight,
            in_proj_weight=None,
            in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
            bias_k=None,
            bias_v=None,
            add_zero_attn=False,
            dropout_p=0,
            out_proj_weight=self.c_proj.weight,
            out_proj_bias=self.c_proj.bias,
            use_separate_proj_weight=True,
            training=self.training,
            need_weights=False
        )

        # Keep only the output at the prepended mean token (sequence index 0).
        return x[0]
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
class ModifiedResNet(nn.Module):
    """
    A ResNet class that is similar to torchvision's but contains the following changes:
    - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
    - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
    - The final pooling layer is a QKV attention instead of an average pool
    """

    def __init__(self, layers, output_dim, heads, input_resolution=224, width=64):
        """`layers` gives the Bottleneck count per stage (4 entries); `width`
        is the stem width; `heads` is the attention-pool head count."""
        super().__init__()
        self.output_dim = output_dim
        self.input_resolution = input_resolution

        # the 3-layer stem
        self.conv1 = nn.Conv2d(3, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(width // 2)
        self.conv2 = nn.Conv2d(width // 2, width // 2, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(width // 2)
        self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False)
        self.bn3 = nn.BatchNorm2d(width)
        self.avgpool = nn.AvgPool2d(2)
        self.relu = nn.ReLU(inplace=True)

        # residual layers
        self._inplanes = width  # this is a *mutable* variable used during construction
        self.layer1 = self._make_layer(width, layers[0])
        self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
        self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
        self.layer4 = self._make_layer(width * 8, layers[3], stride=2)

        embed_dim = width * 32  # the ResNet feature dimension
        # Overall downsampling factor is 32 (stem: 4x, stages 2-4: 2x each).
        self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, heads, output_dim)

    def _make_layer(self, planes, blocks, stride=1):
        """Build one residual stage; only its first block may downsample."""
        layers = [Bottleneck(self._inplanes, planes, stride)]

        # Subsequent blocks take the expanded channel count as input.
        self._inplanes = planes * Bottleneck.expansion
        for _ in range(1, blocks):
            layers.append(Bottleneck(self._inplanes, planes))

        return nn.Sequential(*layers)

    def forward(self, x):
        def stem(x):
            for conv, bn in [(self.conv1, self.bn1), (self.conv2, self.bn2), (self.conv3, self.bn3)]:
                x = self.relu(bn(conv(x)))
            x = self.avgpool(x)
            return x

        # Cast the input to the weight dtype (supports fp16 inference).
        x = x.type(self.conv1.weight.dtype)
        x = stem(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.attnpool(x)

        return x
|
| 151 |
+
|
| 152 |
+
|
| 153 |
+
class LayerNorm(nn.LayerNorm):
    """LayerNorm that normalizes in float32 and casts the result back to the
    input dtype, keeping fp16 inputs numerically stable."""

    def forward(self, x: torch.Tensor):
        normalized = super().forward(x.type(torch.float32))
        return normalized.type(x.dtype)
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
class QuickGELU(nn.Module):
    """Fast GELU approximation: ``x * sigmoid(1.702 * x)``."""

    def forward(self, x: torch.Tensor):
        gate = torch.sigmoid(1.702 * x)
        return x * gate
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
class ResidualAttentionBlock(nn.Module):
    """Pre-norm transformer block: x + attn(ln_1(x)), then x + mlp(ln_2(x))."""

    def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None):
        super().__init__()

        self.attn = nn.MultiheadAttention(d_model, n_head)
        self.ln_1 = LayerNorm(d_model)
        # 4x expansion MLP with the quick-GELU activation.
        self.mlp = nn.Sequential(OrderedDict([
            ("c_fc", nn.Linear(d_model, d_model * 4)),
            ("gelu", QuickGELU()),
            ("c_proj", nn.Linear(d_model * 4, d_model))
        ]))
        self.ln_2 = LayerNorm(d_model)
        self.attn_mask = attn_mask

    def attention(self, x: torch.Tensor):
        # Re-cast the stored mask to the input's dtype/device on every call,
        # since the input may change device or precision between calls.
        self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
        return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]

    def forward(self, x: torch.Tensor):
        x = x + self.attention(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
class Transformer(nn.Module):
    """A stack of ``layers`` residual attention blocks of width ``width``,
    all sharing the same (optional) attention mask."""

    def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None):
        super().__init__()
        self.width = width
        self.layers = layers
        blocks = [ResidualAttentionBlock(width, heads, attn_mask) for _ in range(layers)]
        self.resblocks = nn.Sequential(*blocks)

    def forward(self, x: torch.Tensor):
        return self.resblocks(x)
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
class VisualTransformer(nn.Module):
    """CLIP's ViT image encoder: patchify with a strided conv, prepend a class
    embedding, add positional embeddings, run the transformer, and project the
    class token to ``output_dim``."""

    def __init__(self, input_resolution: int, patch_size: int, width: int, layers: int, heads: int, output_dim: int):
        super().__init__()
        self.input_resolution = input_resolution
        self.output_dim = output_dim
        # Non-overlapping patch embedding: kernel == stride == patch_size.
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)

        scale = width ** -0.5
        self.class_embedding = nn.Parameter(scale * torch.randn(width))
        self.positional_embedding = nn.Parameter(scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width))
        self.ln_pre = LayerNorm(width)

        self.transformer = Transformer(width, layers, heads)

        self.ln_post = LayerNorm(width)
        self.proj = nn.Parameter(scale * torch.randn(width, output_dim))

    def forward(self, x: torch.Tensor):
        x = self.conv1(x)  # shape = [*, width, grid, grid]
        x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
        x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
        # Prepend the class embedding, broadcast across the batch.
        x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1)  # shape = [*, grid ** 2 + 1, width]
        x = x + self.positional_embedding.to(x.dtype)
        x = self.ln_pre(x)

        x = x.permute(1, 0, 2)  # NLD -> LND
        x = self.transformer(x)
        x = x.permute(1, 0, 2)  # LND -> NLD

        # Only the class token (position 0) is used for the image embedding.
        x = self.ln_post(x[:, 0, :])

        if self.proj is not None:
            x = x @ self.proj

        return x
|
| 237 |
+
|
| 238 |
+
|
| 239 |
+
class CLIP(nn.Module):
|
| 240 |
+
def __init__(self,
|
| 241 |
+
embed_dim: int,
|
| 242 |
+
# vision
|
| 243 |
+
image_resolution: int,
|
| 244 |
+
vision_layers: Union[Tuple[int, int, int, int], int],
|
| 245 |
+
vision_width: int,
|
| 246 |
+
vision_patch_size: int,
|
| 247 |
+
# text
|
| 248 |
+
context_length: int,
|
| 249 |
+
vocab_size: int,
|
| 250 |
+
transformer_width: int,
|
| 251 |
+
transformer_heads: int,
|
| 252 |
+
transformer_layers: int
|
| 253 |
+
):
|
| 254 |
+
super().__init__()
|
| 255 |
+
|
| 256 |
+
self.context_length = context_length
|
| 257 |
+
|
| 258 |
+
if isinstance(vision_layers, (tuple, list)):
|
| 259 |
+
vision_heads = vision_width * 32 // 64
|
| 260 |
+
self.visual = ModifiedResNet(
|
| 261 |
+
layers=vision_layers,
|
| 262 |
+
output_dim=embed_dim,
|
| 263 |
+
heads=vision_heads,
|
| 264 |
+
input_resolution=image_resolution,
|
| 265 |
+
width=vision_width
|
| 266 |
+
)
|
| 267 |
+
else:
|
| 268 |
+
vision_heads = vision_width // 64
|
| 269 |
+
self.visual = VisualTransformer(
|
| 270 |
+
input_resolution=image_resolution,
|
| 271 |
+
patch_size=vision_patch_size,
|
| 272 |
+
width=vision_width,
|
| 273 |
+
layers=vision_layers,
|
| 274 |
+
heads=vision_heads,
|
| 275 |
+
output_dim=embed_dim
|
| 276 |
+
)
|
| 277 |
+
|
| 278 |
+
self.transformer = Transformer(
|
| 279 |
+
width=transformer_width,
|
| 280 |
+
layers=transformer_layers,
|
| 281 |
+
heads=transformer_heads,
|
| 282 |
+
attn_mask=self.build_attention_mask()
|
| 283 |
+
)
|
| 284 |
+
|
| 285 |
+
self.vocab_size = vocab_size
|
| 286 |
+
self.token_embedding = nn.Embedding(vocab_size, transformer_width)
|
| 287 |
+
self.positional_embedding = nn.Parameter(torch.empty(self.context_length, transformer_width))
|
| 288 |
+
self.ln_final = LayerNorm(transformer_width)
|
| 289 |
+
|
| 290 |
+
self.text_projection = nn.Parameter(torch.empty(transformer_width, embed_dim))
|
| 291 |
+
self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
|
| 292 |
+
|
| 293 |
+
self.initialize_parameters()
|
| 294 |
+
|
| 295 |
+
def initialize_parameters(self):
|
| 296 |
+
nn.init.normal_(self.token_embedding.weight, std=0.02)
|
| 297 |
+
nn.init.normal_(self.positional_embedding, std=0.01)
|
| 298 |
+
|
| 299 |
+
if isinstance(self.visual, ModifiedResNet):
|
| 300 |
+
if self.visual.attnpool is not None:
|
| 301 |
+
std = self.visual.attnpool.c_proj.in_features ** -0.5
|
| 302 |
+
nn.init.normal_(self.visual.attnpool.q_proj.weight, std=std)
|
| 303 |
+
nn.init.normal_(self.visual.attnpool.k_proj.weight, std=std)
|
| 304 |
+
nn.init.normal_(self.visual.attnpool.v_proj.weight, std=std)
|
| 305 |
+
nn.init.normal_(self.visual.attnpool.c_proj.weight, std=std)
|
| 306 |
+
|
| 307 |
+
for resnet_block in [self.visual.layer1, self.visual.layer2, self.visual.layer3, self.visual.layer4]:
|
| 308 |
+
for name, param in resnet_block.named_parameters():
|
| 309 |
+
if name.endswith("bn3.weight"):
|
| 310 |
+
nn.init.zeros_(param)
|
| 311 |
+
|
| 312 |
+
proj_std = (self.transformer.width ** -0.5) * ((2 * self.transformer.layers) ** -0.5)
|
| 313 |
+
attn_std = self.transformer.width ** -0.5
|
| 314 |
+
fc_std = (2 * self.transformer.width) ** -0.5
|
| 315 |
+
for block in self.transformer.resblocks:
|
| 316 |
+
nn.init.normal_(block.attn.in_proj_weight, std=attn_std)
|
| 317 |
+
nn.init.normal_(block.attn.out_proj.weight, std=proj_std)
|
| 318 |
+
nn.init.normal_(block.mlp.c_fc.weight, std=fc_std)
|
| 319 |
+
nn.init.normal_(block.mlp.c_proj.weight, std=proj_std)
|
| 320 |
+
|
| 321 |
+
if self.text_projection is not None:
|
| 322 |
+
nn.init.normal_(self.text_projection, std=self.transformer.width ** -0.5)
|
| 323 |
+
|
| 324 |
+
def build_attention_mask(self):
|
| 325 |
+
# lazily create causal attention mask, with full attention between the vision tokens
|
| 326 |
+
# pytorch uses additive attention mask; fill with -inf
|
| 327 |
+
mask = torch.empty(self.context_length, self.context_length)
|
| 328 |
+
mask.fill_(float("-inf"))
|
| 329 |
+
mask.triu_(1) # zero out the lower diagonal
|
| 330 |
+
return mask
|
| 331 |
+
|
| 332 |
+
@property
|
| 333 |
+
def dtype(self):
|
| 334 |
+
return self.visual.conv1.weight.dtype
|
| 335 |
+
|
| 336 |
+
def encode_image(self, image):
|
| 337 |
+
return self.visual(image.type(self.dtype))
|
| 338 |
+
|
| 339 |
+
def encode_text(self, text):
|
| 340 |
+
x = self.token_embedding(text).type(self.dtype) # [batch_size, n_ctx, d_model]
|
| 341 |
+
|
| 342 |
+
x = x + self.positional_embedding.type(self.dtype)
|
| 343 |
+
x = x.permute(1, 0, 2) # NLD -> LND
|
| 344 |
+
x = self.transformer(x)
|
| 345 |
+
x = x.permute(1, 0, 2) # LND -> NLD
|
| 346 |
+
x = self.ln_final(x).type(self.dtype)
|
| 347 |
+
|
| 348 |
+
# x.shape = [batch_size, n_ctx, transformer.width]
|
| 349 |
+
# take features from the eot embedding (eot_token is the highest number in each sequence)
|
| 350 |
+
x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection
|
| 351 |
+
|
| 352 |
+
return x
|
| 353 |
+
|
| 354 |
+
def encode_conv_features(self, features):
|
| 355 |
+
# pool to 7, the feature map resolution for 224x224 input
|
| 356 |
+
features = nn.AdaptiveAvgPool2d(7)(features)
|
| 357 |
+
return self.visual.attnpool(features)
|
| 358 |
+
|
| 359 |
+
def forward(self, image, text):
|
| 360 |
+
image_features = self.encode_image(image)
|
| 361 |
+
text_features = self.encode_text(text)
|
| 362 |
+
|
| 363 |
+
# normalized features
|
| 364 |
+
image_features = image_features / image_features.norm(dim=-1, keepdim=True)
|
| 365 |
+
text_features = text_features / text_features.norm(dim=-1, keepdim=True)
|
| 366 |
+
|
| 367 |
+
# cosine similarity as logits
|
| 368 |
+
logit_scale = self.logit_scale.exp()
|
| 369 |
+
logits_per_image = logit_scale * image_features @ text_features.t()
|
| 370 |
+
logits_per_text = logit_scale * text_features @ image_features.t()
|
| 371 |
+
|
| 372 |
+
# shape = [global_batch_size, global_batch_size]
|
| 373 |
+
return logits_per_image, logits_per_text
|
| 374 |
+
|
| 375 |
+
def forward_features(self, features, text):
    """Like `forward`, but starts from precomputed conv feature maps.

    Returns (logits_per_image, logits_per_text), each of shape
    [global_batch_size, global_batch_size].
    """
    img_feat = self.encode_conv_features(features)
    txt_feat = self.encode_text(text)

    # unit-normalize so the dot products below are cosine similarities
    img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)
    txt_feat = txt_feat / txt_feat.norm(dim=-1, keepdim=True)

    # learned temperature, stored in log space
    scale = self.logit_scale.exp()
    logits_per_image = scale * img_feat @ txt_feat.t()
    logits_per_text = scale * txt_feat @ img_feat.t()

    return logits_per_image, logits_per_text
|
| 390 |
+
|
| 391 |
+
|
| 392 |
+
def convert_weights(model: nn.Module):
    """Convert applicable model parameters to fp16, in place."""

    def _to_half(module):
        # conv / linear weights and biases
        if isinstance(module, (nn.Conv1d, nn.Conv2d, nn.Linear)):
            module.weight.data = module.weight.data.half()
            if module.bias is not None:
                module.bias.data = module.bias.data.half()

        # attention projections stored as plain tensors on MultiheadAttention
        if isinstance(module, nn.MultiheadAttention):
            attrs = [f"{s}_proj_weight" for s in ("in", "q", "k", "v")]
            attrs += ["in_proj_bias", "bias_k", "bias_v"]
            for attr in attrs:
                tensor = getattr(module, attr)
                if tensor is not None:
                    tensor.data = tensor.data.half()

        # CLIP-specific projection matrices
        for name in ("text_projection", "proj"):
            proj = getattr(module, name, None)
            if proj is not None:
                proj.data = proj.data.half()

    model.apply(_to_half)
|
| 414 |
+
|
| 415 |
+
|
| 416 |
+
def build_model(state_dict: dict):
    """Instantiate a CLIP model whose architecture is inferred from `state_dict`.

    Works for both ViT and modified-ResNet visual towers; hyperparameters are
    reverse-engineered from tensor shapes and parameter-key patterns.
    """
    # only ViT checkpoints carry a final visual projection matrix
    vit = "visual.proj" in state_dict

    if vit:
        vision_width = state_dict["visual.conv1.weight"].shape[0]
        # one attention in-projection per transformer block
        vision_layers = len([k for k in state_dict.keys() if k.startswith("visual.") and k.endswith(".attn.in_proj_weight")])
        vision_patch_size = state_dict["visual.conv1.weight"].shape[-1]
        # positional embedding holds 1 cls token + grid*grid patch positions
        grid_size = round((state_dict["visual.positional_embedding"].shape[0] - 1) ** 0.5)
        image_resolution = vision_patch_size * grid_size
    else:
        # block count per residual stage (visual.layer1 .. visual.layer4)
        counts: list = [len(set(k.split(".")[2] for k in state_dict if k.startswith(f"visual.layer{b}"))) for b in [1, 2, 3, 4]]
        vision_layers = tuple(counts)
        vision_width = state_dict["visual.layer1.0.conv1.weight"].shape[0]
        # attnpool positional embedding holds 1 + output_width**2 positions
        output_width = round((state_dict["visual.attnpool.positional_embedding"].shape[0] - 1) ** 0.5)
        vision_patch_size = None
        assert output_width ** 2 + 1 == state_dict["visual.attnpool.positional_embedding"].shape[0]
        # factor 32: overall downsampling of the ResNet tower (see * 32 below)
        image_resolution = output_width * 32

    embed_dim = state_dict["text_projection"].shape[1]
    context_length = state_dict["positional_embedding"].shape[0]
    vocab_size = state_dict["token_embedding.weight"].shape[0]
    transformer_width = state_dict["ln_final.weight"].shape[0]
    # head count derived from a fixed 64-dim-per-head convention
    transformer_heads = transformer_width // 64
    transformer_layers = len(set(k.split(".")[2] for k in state_dict if k.startswith(f"transformer.resblocks")))

    model = CLIP(
        embed_dim,
        image_resolution, vision_layers, vision_width, vision_patch_size,
        context_length, vocab_size, transformer_width, transformer_heads, transformer_layers
    )

    # metadata keys are not model parameters; drop them before load_state_dict
    for key in ["input_resolution", "context_length", "vocab_size"]:
        if key in state_dict:
            del state_dict[key]

    # convert_weights(model)  # fp16 conversion left disabled here
    model.load_state_dict(state_dict)
    return model.eval()
|
feature_networks/clip/simple_tokenizer.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gzip
|
| 2 |
+
import html
|
| 3 |
+
import os
|
| 4 |
+
from functools import lru_cache
|
| 5 |
+
|
| 6 |
+
import ftfy
|
| 7 |
+
import regex as re
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
@lru_cache()
def default_bpe():
    """Path to the bundled BPE vocab archive, located next to this module."""
    here = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(here, "bpe_simple_vocab_16e6.txt.gz")
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@lru_cache()
def bytes_to_unicode():
    """
    Returns a dict mapping each utf-8 byte (0..255) to a unicode string.

    The reversible bpe codes work on unicode strings, so every byte needs a
    printable unicode stand-in that the bpe code won't choke on (no
    whitespace/control characters). Printable bytes map to themselves; the
    rest are shifted up past 255.
    """
    printable = (
        list(range(ord("!"), ord("~") + 1))
        + list(range(ord("¡"), ord("¬") + 1))
        + list(range(ord("®"), ord("ÿ") + 1))
    )
    byte_vals = printable[:]
    char_codes = printable[:]
    shift = 0
    for b in range(2 ** 8):
        if b not in byte_vals:
            byte_vals.append(b)
            char_codes.append(2 ** 8 + shift)
            shift += 1
    return {b: chr(c) for b, c in zip(byte_vals, char_codes)}
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def get_pairs(word):
    """Return the set of adjacent symbol pairs in `word`.

    `word` is a tuple of symbols (symbols being variable-length strings).
    """
    prev = word[0]
    pairs = set()
    for sym in word[1:]:
        pairs.add((prev, sym))
        prev = sym
    return pairs
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def basic_clean(text):
    """Fix mojibake via ftfy, undo (possibly double) HTML escaping, and strip."""
    fixed = ftfy.fix_text(text)
    unescaped = html.unescape(html.unescape(fixed))
    return unescaped.strip()
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def whitespace_clean(text):
    """Collapse every whitespace run into a single space and trim the ends."""
    return re.sub(r'\s+', ' ', text).strip()
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
class SimpleTokenizer(object):
    """Byte-level BPE tokenizer used by CLIP.

    Vocab = 256 byte symbols + 256 end-of-word variants + the BPE merges +
    two special tokens ('<|startoftext|>', '<|endoftext|>'), totalling 49152.
    """

    def __init__(self, bpe_path: str = default_bpe()):
        # reversible byte <-> printable-unicode mapping so BPE works on strings
        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
        # merge list: skip the header line, truncate to the vocab budget
        merges = gzip.open(bpe_path).read().decode("utf-8").split('\n')
        merges = merges[1:49152-256-2+1]
        merges = [tuple(merge.split()) for merge in merges]
        vocab = list(bytes_to_unicode().values())
        vocab = vocab + [v+'</w>' for v in vocab]
        for merge in merges:
            vocab.append(''.join(merge))
        vocab.extend(['<|startoftext|>', '<|endoftext|>'])
        self.encoder = dict(zip(vocab, range(len(vocab))))
        self.decoder = {v: k for k, v in self.encoder.items()}
        # lower rank = earlier merge = higher priority
        self.bpe_ranks = dict(zip(merges, range(len(merges))))
        # memoized token -> merged word; the specials map to themselves
        self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'}
        # pre-tokenizer: specials, common contractions, letters, digits, other
        self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)

    def bpe(self, token):
        """Apply BPE merges to one pre-tokenized token.

        Returns the merged symbols joined by single spaces; results are cached.
        """
        if token in self.cache:
            return self.cache[token]
        # mark the last character as end-of-word before merging
        word = tuple(token[:-1]) + ( token[-1] + '</w>',)
        pairs = get_pairs(word)

        if not pairs:
            return token+'</w>'

        while True:
            # merge the adjacent pair with the best (lowest) rank
            bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
            if bigram not in self.bpe_ranks:
                break
            first, second = bigram
            new_word = []
            i = 0
            while i < len(word):
                try:
                    j = word.index(first, i)
                    new_word.extend(word[i:j])
                    i = j
                except:  # `first` no longer occurs past i; keep the tail as-is
                    new_word.extend(word[i:])
                    break

                if word[i] == first and i < len(word)-1 and word[i+1] == second:
                    new_word.append(first+second)
                    i += 2
                else:
                    new_word.append(word[i])
                    i += 1
            new_word = tuple(new_word)
            word = new_word
            if len(word) == 1:
                break
            else:
                pairs = get_pairs(word)
        word = ' '.join(word)
        self.cache[token] = word
        return word

    def encode(self, text):
        """Text -> list of BPE token ids."""
        bpe_tokens = []
        text = whitespace_clean(basic_clean(text)).lower()
        for token in re.findall(self.pat, text):
            # map raw utf-8 bytes to the reversible unicode alphabet first
            token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
            bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
        return bpe_tokens

    def decode(self, tokens):
        """Token ids -> text (inverse of `encode`, up to whitespace)."""
        text = ''.join([self.decoder[token] for token in tokens])
        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ')
        return text
|
feature_networks/constants.py
ADDED
|
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Backbone name registries for the pretrained feature-network builder.
# Names are either torchvision model names or timm model names.

TORCHVISION = [
    "vgg11_bn",
    "vgg13_bn",
    "vgg16",
    "vgg16_bn",
    "vgg19_bn",
    "densenet121",
    "densenet169",
    "densenet201",
    "inception_v3",
    "resnet18",
    "resnet34",
    "resnet50",
    "resnet101",
    "resnet152",
    "shufflenet_v2_x0_5",
    "mobilenet_v2",
    "wide_resnet50_2",
    "mnasnet0_5",
    "mnasnet1_0",
    "ghostnet_100",
    "cspresnet50",
    "fbnetc_100",
    "spnasnet_100",
    "resnet50d",
    "resnet26",
    "resnet26d",
    "seresnet50",
    "resnetblur50",
    "resnetrs50",
    "tf_mixnet_s",
    "tf_mixnet_m",
    "tf_mixnet_l",
    "ese_vovnet19b_dw",
    "ese_vovnet39b",
    "res2next50",
    "gernet_s",
    "gernet_m",
    "repvgg_a2",
    "repvgg_b0",
    "repvgg_b1",
    "repvgg_b1g4",
    "revnet",
    "dm_nfnet_f1",
    "nfnet_l0",
]

REGNETS = [
    "regnetx_002",
    "regnetx_004",
    "regnetx_006",
    "regnetx_008",
    "regnetx_016",
    "regnetx_032",
    "regnetx_040",
    "regnetx_064",
    "regnety_002",
    "regnety_004",
    "regnety_006",
    "regnety_008",
    "regnety_016",
    "regnety_032",
    "regnety_040",
    "regnety_064",
]

# EfficientNets split by which input normalization they were trained with
EFFNETS_IMAGENET = [
    'tf_efficientnet_b0',
    'tf_efficientnet_b1',
    'tf_efficientnet_b2',
    'tf_efficientnet_b3',
    'tf_efficientnet_b4',
    'tf_efficientnet_b0_ns',
]

EFFNETS_INCEPTION = [
    'tf_efficientnet_lite0',
    'tf_efficientnet_lite1',
    'tf_efficientnet_lite2',
    'tf_efficientnet_lite3',
    'tf_efficientnet_lite4',
    'tf_efficientnetv2_b0',
    'tf_efficientnetv2_b1',
    'tf_efficientnetv2_b2',
    'tf_efficientnetv2_b3',
    'efficientnet_b1',
    'efficientnet_b1_pruned',
    'efficientnet_b2_pruned',
    'efficientnet_b3_pruned',
]

EFFNETS = EFFNETS_IMAGENET + EFFNETS_INCEPTION

VITS_IMAGENET = [
    'deit_tiny_distilled_patch16_224',
    'deit_small_distilled_patch16_224',
    'deit_base_distilled_patch16_224',
]

VITS_INCEPTION = [
    'vit_base_patch16_224',
    'vit_large_patch16_224'
]

# masked autoencoder ViTs
MAES = [
    'mae_vit_base_patch16',
    'mae_vit_large_patch16',
    'mae_vit_huge_patch14'
]

# Swin transformer
# NOTE(review): ST appears in NORMALIZED_INCEPTION below but not in
# ALL_MODELS -- confirm whether that is intentional.
ST= ['swin_base_patch4_window7_224']

TNT = ['tnt_b_patch16_224']

VITS = VITS_IMAGENET + VITS_INCEPTION

CLIP = [
    'resnet50_clip'
]

ALL_MODELS = TORCHVISION + REGNETS + EFFNETS + VITS + CLIP + MAES + TNT

# Group according to input normalization

NORMALIZED_IMAGENET = TORCHVISION + REGNETS + EFFNETS_IMAGENET + VITS_IMAGENET

NORMALIZED_INCEPTION = EFFNETS_INCEPTION + VITS_INCEPTION + MAES + TNT + ST

NORMALIZED_CLIP = CLIP
|
feature_networks/pretrained_builder.py
ADDED
|
@@ -0,0 +1,417 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
import torchvision.models as zoomodels
|
| 5 |
+
from torch.autograd import Function
|
| 6 |
+
|
| 7 |
+
import timm
|
| 8 |
+
|
| 9 |
+
from feature_networks import clip
|
| 10 |
+
from feature_networks.vit import _make_vit_b16_backbone, forward_vit
|
| 11 |
+
from feature_networks.constants import ALL_MODELS, VITS, EFFNETS, REGNETS
|
| 12 |
+
from pg_modules.blocks import Interpolate
|
| 13 |
+
|
| 14 |
+
def _feature_splitter(model, idcs):
|
| 15 |
+
pretrained = nn.Module()
|
| 16 |
+
pretrained.layer0 = nn.Sequential(model.features[:idcs[0]])
|
| 17 |
+
pretrained.layer1 = nn.Sequential(model.features[idcs[0]:idcs[1]])
|
| 18 |
+
pretrained.layer2 = nn.Sequential(model.features[idcs[1]:idcs[2]])
|
| 19 |
+
pretrained.layer3 = nn.Sequential(model.features[idcs[2]:idcs[3]])
|
| 20 |
+
return pretrained
|
| 21 |
+
|
| 22 |
+
def _make_resnet(model):
|
| 23 |
+
pretrained = nn.Module()
|
| 24 |
+
pretrained.layer0 = nn.Sequential(
|
| 25 |
+
model.conv1, model.bn1, model.relu, model.maxpool, model.layer1,
|
| 26 |
+
)
|
| 27 |
+
pretrained.layer1 = model.layer2
|
| 28 |
+
pretrained.layer2 = model.layer3
|
| 29 |
+
pretrained.layer3 = model.layer4
|
| 30 |
+
return pretrained
|
| 31 |
+
|
| 32 |
+
def _make_regnet(model):
|
| 33 |
+
pretrained = nn.Module()
|
| 34 |
+
pretrained.layer0 = nn.Sequential(
|
| 35 |
+
model.stem, model.s1
|
| 36 |
+
)
|
| 37 |
+
pretrained.layer1 = model.s2
|
| 38 |
+
pretrained.layer2 = model.s3
|
| 39 |
+
pretrained.layer3 = model.s4
|
| 40 |
+
return pretrained
|
| 41 |
+
|
| 42 |
+
def _make_nfnet(model):
|
| 43 |
+
pretrained = nn.Module()
|
| 44 |
+
pretrained.layer0 = nn.Sequential(
|
| 45 |
+
model.stem, model.stages[0]
|
| 46 |
+
)
|
| 47 |
+
pretrained.layer1 = model.stages[1]
|
| 48 |
+
pretrained.layer2 = model.stages[2]
|
| 49 |
+
pretrained.layer3 = model.stages[3]
|
| 50 |
+
return pretrained
|
| 51 |
+
|
| 52 |
+
def _make_resnet_v2(model):
|
| 53 |
+
pretrained = nn.Module()
|
| 54 |
+
pretrained.layer0 = nn.Sequential(model.stem, model.stages[0])
|
| 55 |
+
pretrained.layer1 = model.stages[1]
|
| 56 |
+
pretrained.layer2 = model.stages[2]
|
| 57 |
+
pretrained.layer3 = model.stages[3]
|
| 58 |
+
return pretrained
|
| 59 |
+
|
| 60 |
+
def _make_resnet_clip(model):
|
| 61 |
+
pretrained = nn.Module()
|
| 62 |
+
|
| 63 |
+
# slightly more complicated than the standard resnet
|
| 64 |
+
pretrained.layer0 = nn.Sequential(
|
| 65 |
+
model.conv1,
|
| 66 |
+
model.bn1,
|
| 67 |
+
model.relu,
|
| 68 |
+
model.conv2,
|
| 69 |
+
model.bn2,
|
| 70 |
+
model.relu,
|
| 71 |
+
model.conv3,
|
| 72 |
+
model.bn3,
|
| 73 |
+
model.relu,
|
| 74 |
+
model.avgpool,
|
| 75 |
+
model.layer1,
|
| 76 |
+
)
|
| 77 |
+
|
| 78 |
+
pretrained.layer1 = model.layer2
|
| 79 |
+
pretrained.layer2 = model.layer3
|
| 80 |
+
pretrained.layer3 = model.layer4
|
| 81 |
+
|
| 82 |
+
return pretrained
|
| 83 |
+
|
| 84 |
+
def _make_densenet(model):
    """Regroup a torchvision DenseNet `features` stack into four stages.

    Stages 1-3 get an AvgPool2d(2, 2) prepended; stages 1 and 2 additionally
    have the last sub-module of their final block replaced by Identity
    (presumably the transition layer's pooling -- verify against torchvision's
    DenseNet layout), effectively moving downsampling in front of each stage.
    """
    pretrained = nn.Module()

    pretrained.layer0 = model.features[:6]

    pretrained.layer1 = model.features[6:8]
    pretrained.layer1[-1][-1] = nn.Identity()  # neutralize trailing sub-module
    pretrained.layer1 = nn.Sequential(nn.AvgPool2d(2, 2), pretrained.layer1)

    pretrained.layer2 = model.features[8:10]
    pretrained.layer2[-1][-1] = nn.Identity()  # neutralize trailing sub-module
    pretrained.layer2 = nn.Sequential(nn.AvgPool2d(2, 2), pretrained.layer2)

    # final stage: no Identity swap here
    pretrained.layer3 = model.features[10:12]
    pretrained.layer3 = nn.Sequential(nn.AvgPool2d(2, 2), pretrained.layer3)

    return pretrained
|
| 101 |
+
|
| 102 |
+
def _make_shufflenet(model):
|
| 103 |
+
pretrained = nn.Module()
|
| 104 |
+
pretrained.layer0 = nn.Sequential(model.conv1, model.maxpool)
|
| 105 |
+
pretrained.layer1 = model.stage2
|
| 106 |
+
pretrained.layer2 = model.stage3
|
| 107 |
+
pretrained.layer3 = model.stage4
|
| 108 |
+
return pretrained
|
| 109 |
+
|
| 110 |
+
def _make_cspresnet(model):
|
| 111 |
+
pretrained = nn.Module()
|
| 112 |
+
pretrained.layer0 = nn.Sequential(model.stem, model.stages[0])
|
| 113 |
+
pretrained.layer1 = model.stages[1]
|
| 114 |
+
pretrained.layer2 = model.stages[2]
|
| 115 |
+
pretrained.layer3 = model.stages[3]
|
| 116 |
+
return pretrained
|
| 117 |
+
|
| 118 |
+
def _make_efficientnet(model):
|
| 119 |
+
pretrained = nn.Module()
|
| 120 |
+
pretrained.layer0 = nn.Sequential(
|
| 121 |
+
model.conv_stem, model.bn1, model.act1, *model.blocks[0:2]
|
| 122 |
+
)
|
| 123 |
+
pretrained.layer1 = nn.Sequential(*model.blocks[2:3])
|
| 124 |
+
pretrained.layer2 = nn.Sequential(*model.blocks[3:5])
|
| 125 |
+
pretrained.layer3 = nn.Sequential(*model.blocks[5:9])
|
| 126 |
+
return pretrained
|
| 127 |
+
|
| 128 |
+
def _make_ghostnet(model):
|
| 129 |
+
pretrained = nn.Module()
|
| 130 |
+
pretrained.layer0 = nn.Sequential(
|
| 131 |
+
model.conv_stem, model.bn1, model.act1, *model.blocks[0:3],
|
| 132 |
+
)
|
| 133 |
+
pretrained.layer1 = nn.Sequential(*model.blocks[3:5])
|
| 134 |
+
pretrained.layer2 = nn.Sequential(*model.blocks[5:7])
|
| 135 |
+
pretrained.layer3 = nn.Sequential(*model.blocks[7:-1])
|
| 136 |
+
return pretrained
|
| 137 |
+
|
| 138 |
+
def _make_vit(model, name):
    """Wrap a timm ViT as a 4-stage backbone via intermediate-layer hooks.

    The size keyword in `name` selects (per-stage feature dims, hooked block
    indices, transformer width).
    """
    configs = {
        'tiny': ([24, 48, 96, 192], [2, 5, 8, 11], 192),
        'small': ([48, 96, 192, 384], [2, 5, 8, 11], 384),
        'base': ([96, 192, 384, 768], [2, 5, 8, 11], 768),
        'large': ([256, 512, 1024, 1024], [5, 11, 17, 23], 1024),
    }
    for size, (features, hooks, vit_features) in configs.items():
        if size in name:
            break
    else:
        raise NotImplementedError('Invalid ViT backbone not available')

    return _make_vit_b16_backbone(
        model,
        features=features,
        size=[224, 224],
        hooks=hooks,
        vit_features=vit_features,
        # deit models carry an extra distillation token in front
        start_index=2 if 'deit' in name else 1,
    )
|
| 170 |
+
|
| 171 |
+
def calc_dims(pretrained, is_vit=False):
    """Probe `pretrained` with a dummy 256x256 image to measure stage shapes.

    Returns (channels, res_mult): per-stage channel counts and the ratio of
    each stage's spatial size to the input resolution.
    """
    inp_res = 256
    tmp = torch.zeros(1, 3, inp_res, inp_res)

    if is_vit:
        outs = forward_vit(pretrained, tmp)
        dims = [out.shape[1:3] for out in outs]
    else:
        dims = []
        for layer in (pretrained.layer0, pretrained.layer1,
                      pretrained.layer2, pretrained.layer3):
            tmp = layer(tmp)
            dims.append(tmp.shape[1:3])

    # split into channel counts and resolution multipliers
    dims = np.array(dims)
    channels = dims[:, 0]
    res_mult = dims[:, 1] / inp_res
    return channels, res_mult
|
| 194 |
+
|
| 195 |
+
def _make_pretrained(backbone, verbose=False):
    """Build a 4-stage feature extractor (`layer0..layer3`) from a pretrained net.

    The backbone is loaded from torchvision, timm, or CLIP and regrouped by
    the matching `_make_*` helper; `CHANNELS` and `RES_MULT` are then probed
    via `calc_dims`.

    Fix vs. the original: the `dm_nfnet_f1` branch appeared twice; the second
    (`_make_nfnet`) branch was unreachable dead code. `_make_cspresnet` and
    `_make_nfnet` perform the same regrouping (stem + stages[0..3]), so
    routing `dm_nfnet_f1` through `_make_nfnet` removes the duplicate without
    changing behavior.
    """
    assert backbone in ALL_MODELS

    # torchvision nets split purely by indices into `model.features`
    split_idcs = {
        'vgg11_bn': [7, 14, 21, 28],
        'vgg13_bn': [13, 20, 27, 34],
        'vgg16_bn': [13, 23, 33, 43],
        'vgg19_bn': [13, 26, 39, 52],
        'mobilenet_v2': [4, 7, 14, 18],  # same structure as vgg
    }
    tv_resnets = {'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152',
                  'wide_resnet50_2', 'wide_resnet101_2'}
    densenets = {'densenet121', 'densenet169', 'densenet201'}
    mnasnets = {'mnasnet0_5', 'mnasnet1_0'}
    # timm resnet-likes that need `relu` aliased to timm's `act1`
    timm_resnets = {'resnet50d', 'resnet26', 'resnet26d', 'seresnet50',
                    'resnetblur50', 'resnetrs50', 'res2next50'}
    # timm nets exposing `stem` + `stages`
    timm_staged = {'cspresnet50', 'ese_vovnet19b_dw', 'ese_vovnet39b',
                   'gernet_s', 'gernet_m', 'repvgg_a2', 'repvgg_b0',
                   'repvgg_b1', 'repvgg_b1g4', 'dm_nfnet_f0'}
    timm_effnet_like = {'fbnetc_100', 'spnasnet_100',
                        'tf_mixnet_s', 'tf_mixnet_m', 'tf_mixnet_l'}
    nfnets = {'dm_nfnet_f1', 'nfnet_l0'}

    if backbone in split_idcs:
        model = zoomodels.__dict__[backbone](True)
        pretrained = _feature_splitter(model, split_idcs[backbone])

    elif backbone in mnasnets:
        model = zoomodels.__dict__[backbone](True)
        model.features = model.layers  # mnasnet names its feature stack `layers`
        pretrained = _feature_splitter(model, [9, 10, 12, 14])

    elif backbone in densenets:
        model = zoomodels.__dict__[backbone](True)
        pretrained = _make_densenet(model)

    elif backbone in tv_resnets:
        model = zoomodels.__dict__[backbone](True)
        pretrained = _make_resnet(model)

    elif backbone == 'shufflenet_v2_x0_5':
        model = zoomodels.__dict__[backbone](True)
        pretrained = _make_shufflenet(model)

    elif backbone == 'ghostnet_100':
        model = timm.create_model(backbone, pretrained=True)
        pretrained = _make_ghostnet(model)

    elif backbone in timm_resnets:
        model = timm.create_model(backbone, pretrained=True)
        model.relu = model.act1
        pretrained = _make_resnet(model)

    elif backbone in timm_staged:
        model = timm.create_model(backbone, pretrained=True)
        pretrained = _make_cspresnet(model)

    elif backbone in timm_effnet_like:
        model = timm.create_model(backbone, pretrained=True)
        pretrained = _make_efficientnet(model)

    elif backbone in nfnets:
        model = timm.create_model(backbone, pretrained=True)
        pretrained = _make_nfnet(model)

    elif backbone in REGNETS:
        model = timm.create_model(backbone, pretrained=True)
        pretrained = _make_regnet(model)

    elif backbone in EFFNETS:
        model = timm.create_model(backbone, pretrained=True)
        pretrained = _make_efficientnet(model)

    elif backbone in VITS:
        model = timm.create_model(backbone, pretrained=True)
        pretrained = _make_vit(model, backbone)

    elif backbone == 'resnet50_clip':
        model = clip.load('RN50', device='cpu', jit=False)[0].visual
        pretrained = _make_resnet_clip(model)

    else:
        # NOTE(review): 'vgg16' and 'revnet' are listed in ALL_MODELS but have
        # no construction branch, so they land here -- same as the original.
        raise NotImplementedError('Wrong model name?')

    pretrained.CHANNELS, pretrained.RES_MULT = calc_dims(pretrained, is_vit=backbone in VITS)

    if verbose:
        print(f"Succesfully loaded: {backbone}")
        print(f"Channels: {pretrained.CHANNELS}")
        print(f"Resolution Multiplier: {pretrained.RES_MULT}")
        print(f"Out Res for 256 : {pretrained.RES_MULT*256}")

    return pretrained
|
feature_networks/vit.py
ADDED
|
@@ -0,0 +1,436 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import timm
|
| 4 |
+
import types
|
| 5 |
+
import math
|
| 6 |
+
import torch.nn.functional as F
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class Slice(nn.Module):
    """Drop the leading readout token(s) from a ViT token sequence.

    Keeps only ``x[:, start_index:]``, discarding the CLS (and optional
    distillation) tokens that precede the patch tokens.
    """

    def __init__(self, start_index=1):
        super().__init__()
        self.start_index = start_index

    def forward(self, x):
        # x: (B, tokens, C) -> (B, tokens - start_index, C)
        return x[:, self.start_index:]
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
class AddReadout(nn.Module):
    """Fuse the readout token(s) into the patch tokens by addition.

    With ``start_index == 2`` the two leading tokens (CLS + distillation)
    are averaged; otherwise the single CLS token is used. The readout is
    broadcast-added onto every remaining patch token.
    """

    def __init__(self, start_index=1):
        super().__init__()
        self.start_index = start_index

    def forward(self, x):
        if self.start_index == 2:
            fused = (x[:, 0] + x[:, 1]) / 2
        else:
            fused = x[:, 0]
        return x[:, self.start_index:] + fused.unsqueeze(1)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
class ProjectReadout(nn.Module):
    """Fuse the readout token via concatenation plus a learned projection.

    Each patch token is concatenated with the CLS token and mapped back to
    ``in_features`` channels through Linear + GELU.
    """

    def __init__(self, in_features, start_index=1):
        super().__init__()
        self.start_index = start_index
        # Maps [patch_token ; readout_token] (2*C) back down to C.
        self.project = nn.Sequential(nn.Linear(2 * in_features, in_features), nn.GELU())

    def forward(self, x):
        patches = x[:, self.start_index:]
        readout = x[:, 0].unsqueeze(1).expand_as(patches)
        return self.project(torch.cat((patches, readout), -1))
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class Transpose(nn.Module):
    """``nn.Module`` wrapper around ``Tensor.transpose`` returning a
    contiguous tensor (needed before reshaping/unflattening)."""

    def __init__(self, dim0, dim1):
        super().__init__()
        self.dim0 = dim0
        self.dim1 = dim1

    def forward(self, x):
        return x.transpose(self.dim0, self.dim1).contiguous()
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def forward_vit(pretrained, x):
    """Run a hooked ViT backbone and return four multi-scale feature maps.

    Relies on the forward hooks registered in ``_make_vit_b16_backbone``,
    which capture intermediate transformer-block outputs into
    ``pretrained.activations`` during the ``forward_flex`` call below.
    """
    b, c, h, w = x.shape

    # Forward pass is needed only for its hook side effects; the returned
    # latent itself is unused here (typo "lantent" kept as-is).
    lantent, _ = pretrained.model.forward_flex(x)

    layer_1 = pretrained.activations["1"]
    layer_2 = pretrained.activations["2"]
    layer_3 = pretrained.activations["3"]
    layer_4 = pretrained.activations["4"]

    # Stages [0:2] of each post-processing Sequential: readout op + transpose.
    layer_1 = pretrained.layer1[0:2](layer_1)
    layer_2 = pretrained.layer2[0:2](layer_2)
    layer_3 = pretrained.layer3[0:2](layer_3)
    layer_4 = pretrained.layer4[0:2](layer_4)

    # Input-size-dependent unflatten; it replaces the static nn.Unflatten at
    # index 2 of each layerN Sequential, which is why the [3:] slices below
    # deliberately skip index 2.
    unflatten = nn.Sequential(
        nn.Unflatten(
            2,
            torch.Size(
                [
                    h // pretrained.model.patch_size[1],
                    w // pretrained.model.patch_size[0],
                ]
            ),
        )
    )

    # ndim == 3 means the activation is still a token sequence (B, C, N).
    if layer_1.ndim == 3:
        layer_1 = unflatten(layer_1)
    if layer_2.ndim == 3:
        layer_2 = unflatten(layer_2)
    if layer_3.ndim == 3:
        layer_3 = unflatten(layer_3)
    if layer_4.ndim == 3:
        layer_4 = unflatten(layer_4)

    # Remaining stages [3:]: 1x1 projection conv (+ optional resampling conv).
    layer_1 = pretrained.layer1[3 : len(pretrained.layer1)](layer_1)
    layer_2 = pretrained.layer2[3 : len(pretrained.layer2)](layer_2)
    layer_3 = pretrained.layer3[3 : len(pretrained.layer3)](layer_3)
    layer_4 = pretrained.layer4[3 : len(pretrained.layer4)](layer_4)

    return layer_1, layer_2, layer_3, layer_4
|
| 99 |
+
|
| 100 |
+
def forward_swin(pretrained, x):
    """Run a hooked Swin backbone and return four multi-scale feature maps.

    Mirrors :func:`forward_vit`: the forward pass populates
    ``pretrained.activations`` via registered hooks, and ``layer1``-``layer4``
    post-process the captured activations.

    Bug fixed: ``_make_swin_b16_backbone`` installs ``forward_flex_swin``,
    which returns a single tensor, while the original
    ``lantent, _ = pretrained.model.forward_flex(x)`` assumed a 2-tuple —
    unpacking a tensor that way splits it along the batch dimension
    (wrong result, or ValueError for batch sizes != 2). Both return
    conventions are now accepted.
    """
    b, c, h, w = x.shape

    # Forward pass is needed only for its hook side effects.
    latent = pretrained.model.forward_flex(x)
    if isinstance(latent, tuple):
        latent = latent[0]

    layer_1 = pretrained.activations["1"]
    layer_2 = pretrained.activations["2"]
    layer_3 = pretrained.activations["3"]
    layer_4 = pretrained.activations["4"]

    # Stages [0:2] of each post-processing Sequential: readout op + transpose.
    layer_1 = pretrained.layer1[0:2](layer_1)
    layer_2 = pretrained.layer2[0:2](layer_2)
    layer_3 = pretrained.layer3[0:2](layer_3)
    layer_4 = pretrained.layer4[0:2](layer_4)

    # Input-size-dependent unflatten; replaces the static nn.Unflatten at
    # index 2 of each layerN Sequential (hence the [3:] slices below).
    # NOTE(review): patch_size is [16, 16] here even though Swin stages run
    # at strides 4/8/16/32 — confirm the grid matches the hooked stages.
    unflatten = nn.Sequential(
        nn.Unflatten(
            2,
            torch.Size(
                [
                    h // pretrained.model.patch_size[1],
                    w // pretrained.model.patch_size[0],
                ]
            ),
        )
    )

    # ndim == 3 means the activation is still a token sequence (B, C, N).
    if layer_1.ndim == 3:
        layer_1 = unflatten(layer_1)
    if layer_2.ndim == 3:
        layer_2 = unflatten(layer_2)
    if layer_3.ndim == 3:
        layer_3 = unflatten(layer_3)
    if layer_4.ndim == 3:
        layer_4 = unflatten(layer_4)

    # Remaining stages [3:]: 1x1 projection conv.
    layer_1 = pretrained.layer1[3 : len(pretrained.layer1)](layer_1)
    layer_2 = pretrained.layer2[3 : len(pretrained.layer2)](layer_2)
    layer_3 = pretrained.layer3[3 : len(pretrained.layer3)](layer_3)
    layer_4 = pretrained.layer4[3 : len(pretrained.layer4)](layer_4)

    return layer_1, layer_2, layer_3, layer_4
|
| 143 |
+
|
| 144 |
+
def _resize_pos_embed(self, posemb, gs_h, gs_w):
|
| 145 |
+
posemb_tok, posemb_grid = (
|
| 146 |
+
posemb[:, : self.start_index],
|
| 147 |
+
posemb[0, self.start_index :],
|
| 148 |
+
)
|
| 149 |
+
|
| 150 |
+
gs_old = int(math.sqrt(len(posemb_grid)))
|
| 151 |
+
|
| 152 |
+
posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2)
|
| 153 |
+
posemb_grid = F.interpolate(posemb_grid, size=(gs_h, gs_w), mode="bilinear", align_corners=False)
|
| 154 |
+
posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1)
|
| 155 |
+
posemb = torch.cat([posemb_tok, posemb_grid], dim=1)
|
| 156 |
+
|
| 157 |
+
return posemb
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
def forward_flex(self, x):
    """Forward a timm VisionTransformer at arbitrary input resolution.

    Injected into the model via ``types.MethodType`` (see
    ``_make_vit_b16_backbone``); the learned position embedding is resized
    on the fly to match the incoming spatial grid.

    Returns:
        (tokens, None): normalized token sequence plus a placeholder so
        callers can unpack two values.
    """
    b, c, h, w = x.shape
    # print(x.shape, self.OOD2ID)
    # x = self.OOD2ID(x)

    # Interpolate the position embedding to the current patch grid.
    pos_embed = self._resize_pos_embed(
        self.pos_embed, h // self.patch_size[1], w // self.patch_size[0]
    )

    B = x.shape[0]

    # Hybrid ViT: run the conv stem first and keep its last feature map.
    if hasattr(self.patch_embed, "backbone"):
        x = self.patch_embed.backbone(x)
        if isinstance(x, (list, tuple)):
            x = x[-1]  # last feature if backbone outputs list/tuple of features

    # Patchify: (B, C, H, W) -> (B, num_patches, embed_dim).
    x = self.patch_embed.proj(x).flatten(2).transpose(1, 2)

    # DeiT-style models carry an extra distillation token.
    if hasattr(self, "dist_token") and self.dist_token is not None:
        cls_tokens = self.cls_token.expand(
            B, -1, -1
        )  # stole cls_tokens impl from Phil Wang, thanks
        dist_token = self.dist_token.expand(B, -1, -1)
        x = torch.cat((cls_tokens, dist_token, x), dim=1)
    else:
        cls_tokens = self.cls_token.expand(
            B, -1, -1
        )  # stole cls_tokens impl from Phil Wang, thanks
        x = torch.cat((cls_tokens, x), dim=1)

    x = x + pos_embed
    x = self.pos_drop(x)

    for blk in self.blocks:
        x = blk(x)

    x = self.norm(x)

    return x, None
|
| 199 |
+
|
| 200 |
+
def forward_flex_swin(self, x):
    """Forward a Swin transformer trunk up to (and including) the final norm.

    NOTE(review): unlike ``forward_flex`` this returns a single tensor,
    not a ``(latent, None)`` tuple — callers that unpack two values
    (see ``forward_swin``) should be checked against this.
    """
    x = self.patch_embed(x)
    if self.absolute_pos_embed is not None:
        x = x + self.absolute_pos_embed
    x = self.pos_drop(x)
    x = self.layers(x)
    x = self.norm(x)  # B L C

    return x
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
# Module-level store shared by every registered hook, keyed by the name
# passed to get_activation at registration time.
activations = {}


def get_activation(name):
    """Return a forward hook that records a module's output under *name*."""

    def _hook(module, inputs, output):
        activations[name] = output

    return _hook
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
def get_readout_oper(vit_features, features, use_readout, start_index=1):
    """Build one readout operation per output feature stage.

    Args:
        vit_features: transformer embedding width (used by 'project').
        features: per-stage output channel list; only its length matters here.
        use_readout: one of 'ignore', 'add', 'project'.
        start_index: number of leading readout tokens (1 = CLS only).

    Returns:
        A list of ``len(features)`` readout modules. For 'ignore' and 'add'
        the SAME (stateless) module instance is shared across stages; for
        'project' each stage gets its own learned projection.

    Raises:
        ValueError: for an unknown ``use_readout`` mode. (The original used
        ``assert False``, which is silently stripped under ``python -O``.)
    """
    if use_readout == "ignore":
        readout_oper = [Slice(start_index)] * len(features)
    elif use_readout == "add":
        readout_oper = [AddReadout(start_index)] * len(features)
    elif use_readout == "project":
        readout_oper = [
            ProjectReadout(vit_features, start_index) for out_feat in features
        ]
    else:
        raise ValueError(
            "wrong operation for readout token, use_readout can be 'ignore', 'add', or 'project'"
        )

    return readout_oper
|
| 237 |
+
|
| 238 |
+
|
| 239 |
+
def _make_vit_b16_backbone(
    model,
    features=[96, 192, 384, 768],
    size=[384, 384],
    hooks=[2, 5, 8, 11],
    vit_features=768,
    use_readout="ignore",
    start_index=1,
):
    """Wrap a timm ViT-B/16 so it exposes four CNN-like feature stages.

    Forward hooks on the transformer blocks named in ``hooks`` capture token
    sequences into the shared module-level ``activations`` dict;
    ``pretrained.layer1..4`` convert each captured sequence into a 2D feature
    map at a different scale (consumed by ``forward_vit``).

    NOTE: the list default arguments are shared across calls; they are never
    mutated here, so this is safe as written.
    """
    pretrained = nn.Module()

    pretrained.model = model
    # Capture the outputs of four chosen transformer blocks.
    pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1"))
    pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2"))
    pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3"))
    pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4"))

    pretrained.activations = activations

    readout_oper = get_readout_oper(vit_features, features, use_readout, start_index)

    # 32, 48, 136, 384
    # Each layerN: readout -> (B, C, N) transpose -> static unflatten
    # (replaced at run time by forward_vit, which skips index 2) ->
    # 1x1 projection conv -> optional up/down-sampling conv.
    pretrained.layer1 = nn.Sequential(
        readout_oper[0],
        Transpose(1, 2),
        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
        nn.Conv2d(
            in_channels=vit_features,
            out_channels=features[0],
            kernel_size=1,
            stride=1,
            padding=0,
        ),
        # 4x upsample -> stride-4 feature map.
        nn.ConvTranspose2d(
            in_channels=features[0],
            out_channels=features[0],
            kernel_size=4,
            stride=4,
            padding=0,
            bias=True,
            dilation=1,
            groups=1,
        ),
    )

    pretrained.layer2 = nn.Sequential(
        readout_oper[1],
        Transpose(1, 2),
        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
        nn.Conv2d(
            in_channels=vit_features,
            out_channels=features[1],
            kernel_size=1,
            stride=1,
            padding=0,
        ),
        # 2x upsample -> stride-8 feature map.
        nn.ConvTranspose2d(
            in_channels=features[1],
            out_channels=features[1],
            kernel_size=2,
            stride=2,
            padding=0,
            bias=True,
            dilation=1,
            groups=1,
        ),
    )

    # Stride-16 stage: projection only, no resampling.
    pretrained.layer3 = nn.Sequential(
        readout_oper[2],
        Transpose(1, 2),
        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
        nn.Conv2d(
            in_channels=vit_features,
            out_channels=features[2],
            kernel_size=1,
            stride=1,
            padding=0,
        ),
    )

    pretrained.layer4 = nn.Sequential(
        readout_oper[3],
        Transpose(1, 2),
        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
        nn.Conv2d(
            in_channels=vit_features,
            out_channels=features[3],
            kernel_size=1,
            stride=1,
            padding=0,
        ),
        # Stride-2 conv -> stride-32 feature map.
        nn.Conv2d(
            in_channels=features[3],
            out_channels=features[3],
            kernel_size=3,
            stride=2,
            padding=1,
        ),
    )

    pretrained.model.start_index = start_index
    pretrained.model.patch_size = [16, 16]

    # We inject this function into the VisionTransformer instances so that
    # we can use it with interpolated position embeddings without modifying the library source.
    pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model)
    pretrained.model._resize_pos_embed = types.MethodType(
        _resize_pos_embed, pretrained.model
    )

    return pretrained
|
| 351 |
+
|
| 352 |
+
def _make_swin_b16_backbone(
    model,
    features=[96, 192, 384, 768],
    size=[384, 384],
    hooks=[2, 5, 8, 11],
    vit_features=768,
    use_readout="ignore",
    start_index=1,
):
    """Wrap a Swin transformer so it exposes four CNN-like feature stages.

    Analogue of ``_make_vit_b16_backbone`` for hierarchical (Swin) models:
    channel width doubles per stage (vit_features * 1/2/4/8) while spatial
    resolution halves (size // 4/8/16/32).

    NOTE(review): hooks are taken from ``model.blocks[...]``; timm Swin
    models expose stages as ``model.layers`` — confirm against the actual
    model class used.
    NOTE(review): ``patch_size`` is set to [16, 16] and ``forward_flex`` is
    bound to ``forward_flex_swin``, which returns a single tensor rather
    than the (latent, None) tuple ``forward_swin`` unpacks — verify.
    """
    pretrained = nn.Module()

    pretrained.model = model
    # Hook the last block of each of the four chosen stages.
    pretrained.model.blocks[hooks[0]].blocks[-1].register_forward_hook(get_activation("1"))
    pretrained.model.blocks[hooks[1]].blocks[-1].register_forward_hook(get_activation("2"))
    pretrained.model.blocks[hooks[2]].blocks[-1].register_forward_hook(get_activation("3"))
    pretrained.model.blocks[hooks[3]].blocks[-1].register_forward_hook(get_activation("4"))

    pretrained.activations = activations

    readout_oper = get_readout_oper(vit_features, features, use_readout, start_index)

    # 32, 48, 136, 384
    # Each layerN: readout -> transpose -> static unflatten at the stage's
    # stride -> 1x1 projection conv (input width doubles per stage).
    pretrained.layer1 = nn.Sequential(
        readout_oper[0],
        Transpose(1, 2),
        nn.Unflatten(2, torch.Size([size[0] // 4, size[1] // 4])),
        nn.Conv2d(
            in_channels=vit_features,
            out_channels=features[0],
            kernel_size=1,
            stride=1,
            padding=0,
        ),
    )

    pretrained.layer2 = nn.Sequential(
        readout_oper[1],
        Transpose(1, 2),
        nn.Unflatten(2, torch.Size([size[0] // 8, size[1] // 8])),
        nn.Conv2d(
            in_channels=vit_features*2,
            out_channels=features[1],
            kernel_size=1,
            stride=1,
            padding=0,
        ),
    )

    pretrained.layer3 = nn.Sequential(
        readout_oper[2],
        Transpose(1, 2),
        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
        nn.Conv2d(
            in_channels=vit_features*4,
            out_channels=features[2],
            kernel_size=1,
            stride=1,
            padding=0,
        ),
    )

    pretrained.layer4 = nn.Sequential(
        readout_oper[3],
        Transpose(1, 2),
        nn.Unflatten(2, torch.Size([size[0] // 32, size[1] // 32])),
        nn.Conv2d(
            in_channels=vit_features*8,
            out_channels=features[3],
            kernel_size=1,
            stride=1,
            padding=0,
        ),
    )

    pretrained.model.start_index = start_index
    pretrained.model.patch_size = [16, 16]

    # We inject this function into the VisionTransformer instances so that
    # we can use it with interpolated position embeddings without modifying the library source.
    pretrained.model.forward_flex = types.MethodType(forward_flex_swin, pretrained.model)
    pretrained.model._resize_pos_embed = types.MethodType(
        _resize_pos_embed, pretrained.model
    )

    return pretrained
|
legacy.py
ADDED
|
@@ -0,0 +1,331 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 4 |
+
# and proprietary rights in and to this software, related documentation
|
| 5 |
+
# and any modifications thereto. Any use, reproduction, disclosure or
|
| 6 |
+
# distribution of this software and related documentation without an express
|
| 7 |
+
# license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 8 |
+
|
| 9 |
+
"""Converting legacy network pickle into the new format."""
|
| 10 |
+
|
| 11 |
+
import click
|
| 12 |
+
import pickle
|
| 13 |
+
import re
|
| 14 |
+
import copy
|
| 15 |
+
import numpy as np
|
| 16 |
+
import torch
|
| 17 |
+
import io
|
| 18 |
+
import dnnlib
|
| 19 |
+
import misc
|
| 20 |
+
|
| 21 |
+
#----------------------------------------------------------------------------
|
| 22 |
+
|
| 23 |
+
def load_network_pkl(f, force_fp16=False):
    """Load a StyleGAN network pickle, converting legacy TF pickles on the fly.

    Args:
        f: binary file-like object containing the pickle.
        force_fp16: if True, re-instantiate G/D/G_ema with FP16 synthesis
            settings (num_fp16_res=4, conv_clamp=256) and copy weights over.

    Returns:
        dict with keys 'G', 'D', 'G_ema', 'training_set_kwargs', 'augment_pipe'.

    SECURITY NOTE: unpickling executes arbitrary code — only use on
    trusted files.
    """
    data = _LegacyUnpickler(f).load()

    # Legacy TensorFlow pickle => convert.
    if isinstance(data, tuple) and len(data) == 3 and all(isinstance(net, _TFNetworkStub) for net in data):
        tf_G, tf_D, tf_Gs = data
        G = convert_tf_generator(tf_G)
        D = convert_tf_discriminator(tf_D)
        G_ema = convert_tf_generator(tf_Gs)
        data = dict(G=G, D=D, G_ema=G_ema)

    # Add missing fields.
    if 'training_set_kwargs' not in data:
        data['training_set_kwargs'] = None
    if 'augment_pipe' not in data:
        data['augment_pipe'] = None

    # Validate contents.
    assert isinstance(data['G'], torch.nn.Module)
    assert isinstance(data['D'], torch.nn.Module)
    assert isinstance(data['G_ema'], torch.nn.Module)
    assert isinstance(data['training_set_kwargs'], (dict, type(None)))
    assert isinstance(data['augment_pipe'], (torch.nn.Module, type(None)))

    # Force FP16.
    if force_fp16:
        for key in ['G', 'D', 'G_ema']:
            old = data[key]
            kwargs = copy.deepcopy(old.init_kwargs)
            # Networks without a 'synthesis_kwargs' sub-dict get the FP16
            # fields applied at the top level instead.
            fp16_kwargs = kwargs.get('synthesis_kwargs', kwargs)
            fp16_kwargs.num_fp16_res = 4
            fp16_kwargs.conv_clamp = 256
            # Only rebuild when the kwargs actually changed.
            if kwargs != old.init_kwargs:
                new = type(old)(**kwargs).eval().requires_grad_(False)
                misc.copy_params_and_buffers(old, new, require_all=True)
                data[key] = new
    return data
|
| 60 |
+
|
| 61 |
+
#----------------------------------------------------------------------------
|
| 62 |
+
|
| 63 |
+
class _TFNetworkStub(dnnlib.EasyDict):
    """Stand-in for ``dnnlib.tflib.network.Network`` when unpickling legacy
    TensorFlow pickles without TensorFlow installed (see ``_LegacyUnpickler``)."""
    pass
|
| 65 |
+
|
| 66 |
+
class _LegacyUnpickler(pickle.Unpickler):
    """Unpickler that maps legacy TF classes to stubs and forces CPU tensors."""

    def find_class(self, module, name):
        # print(module,name)
        # NOTE(review): returning None for '__builtin__' references is
        # unusual — any pickle that actually instantiates such a class will
        # then fail; presumably intended to neutralize them. Confirm.
        if module == '__builtin__':
            return
        if module == 'dnnlib.tflib.network' and name == 'Network':
            return _TFNetworkStub
        # Route tensor deserialization through map_location='cpu' so that
        # GPU-saved pickles load on CPU-only machines.
        if module == 'torch.storage' and name == '_load_from_bytes':
            return lambda b: torch.load(io.BytesIO(b), map_location='cpu')
        return super().find_class(module, name)
|
| 76 |
+
|
| 77 |
+
#----------------------------------------------------------------------------
|
| 78 |
+
|
| 79 |
+
def _collect_tf_params(tf_net):
|
| 80 |
+
# pylint: disable=protected-access
|
| 81 |
+
tf_params = dict()
|
| 82 |
+
def recurse(prefix, tf_net):
|
| 83 |
+
for name, value in tf_net.variables:
|
| 84 |
+
tf_params[prefix + name] = value
|
| 85 |
+
for name, comp in tf_net.components.items():
|
| 86 |
+
recurse(prefix + name + '/', comp)
|
| 87 |
+
recurse('', tf_net)
|
| 88 |
+
return tf_params
|
| 89 |
+
|
| 90 |
+
#----------------------------------------------------------------------------
|
| 91 |
+
|
| 92 |
+
def _populate_module_params(module, *patterns):
    """Copy TF values into a module's params/buffers by regex pattern.

    ``patterns`` alternates (regex, value_fn): for each named param/buffer,
    the first regex that fully matches wins and ``value_fn(*groups)``
    supplies the value. A value_fn of None means "recognized, but leave the
    tensor untouched" (e.g. precomputed filters).

    Raises:
        AssertionError: if a param/buffer matches no pattern; its name and
        shape are printed first to aid debugging.
    """
    for name, tensor in misc.named_params_and_buffers(module):
        found = False
        value = None
        for pattern, value_fn in zip(patterns[0::2], patterns[1::2]):
            match = re.fullmatch(pattern, name)
            if match:
                found = True
                if value_fn is not None:
                    value = value_fn(*match.groups())
                break
        try:
            assert found
            if value is not None:
                tensor.copy_(torch.from_numpy(np.array(value)))
        except:
            # Intentionally broad: print the offending tensor, then re-raise.
            print(name, list(tensor.shape))
            raise
|
| 110 |
+
|
| 111 |
+
#----------------------------------------------------------------------------
|
| 112 |
+
|
| 113 |
+
def convert_tf_generator(tf_G):
    """Convert a legacy TensorFlow generator stub into a PyTorch Generator.

    Maps TF ``static_kwargs`` onto the PyTorch constructor kwargs,
    instantiates ``networks_stylegan2.Generator``, then copies every TF
    variable into the matching PyTorch parameter/buffer by regex.

    Raises:
        ValueError: if the pickle version is too old or an unrecognized
        TF kwarg is present.
    """
    if tf_G.version < 4:
        raise ValueError('TensorFlow pickle version too low')

    # Collect kwargs.
    tf_kwargs = tf_G.static_kwargs
    known_kwargs = set()
    def kwarg(tf_name, default=None, none=None):
        # Record the name as consumed; map an explicit None value to `none`.
        known_kwargs.add(tf_name)
        val = tf_kwargs.get(tf_name, default)
        return val if val is not None else none

    # Convert kwargs.
    from pg_modules import networks_stylegan2
    network_class = networks_stylegan2.Generator
    kwargs = dnnlib.EasyDict(
        z_dim = kwarg('latent_size', 512),
        c_dim = kwarg('label_size', 0),
        w_dim = kwarg('dlatent_size', 512),
        img_resolution = kwarg('resolution', 1024),
        img_channels = kwarg('num_channels', 3),
        channel_base = kwarg('fmap_base', 16384) * 2,
        channel_max = kwarg('fmap_max', 512),
        num_fp16_res = kwarg('num_fp16_res', 0),
        conv_clamp = kwarg('conv_clamp', None),
        architecture = kwarg('architecture', 'skip'),
        resample_filter = kwarg('resample_kernel', [1,3,3,1]),
        use_noise = kwarg('use_noise', True),
        activation = kwarg('nonlinearity', 'lrelu'),
        mapping_kwargs = dnnlib.EasyDict(
            num_layers = kwarg('mapping_layers', 8),
            embed_features = kwarg('label_fmaps', None),
            layer_features = kwarg('mapping_fmaps', None),
            activation = kwarg('mapping_nonlinearity', 'lrelu'),
            lr_multiplier = kwarg('mapping_lrmul', 0.01),
            w_avg_beta = kwarg('w_avg_beta', 0.995, none=1),
        ),
    )

    # Check for unknown kwargs.
    # These names are consumed (marked known) but deliberately ignored.
    kwarg('truncation_psi')
    kwarg('truncation_cutoff')
    kwarg('style_mixing_prob')
    kwarg('structure')
    kwarg('conditioning')
    kwarg('fused_modconv')
    unknown_kwargs = list(set(tf_kwargs.keys()) - known_kwargs)
    if len(unknown_kwargs) > 0:
        raise ValueError('Unknown TensorFlow kwarg', unknown_kwargs[0])

    # Collect params.
    tf_params = _collect_tf_params(tf_G)
    for name, value in list(tf_params.items()):
        # Progressive-growing pickles store per-lod ToRGB layers; remap them
        # to the per-resolution naming used here.
        match = re.fullmatch(r'ToRGB_lod(\d+)/(.*)', name)
        if match:
            r = kwargs.img_resolution // (2 ** int(match.group(1)))
            tf_params[f'{r}x{r}/ToRGB/{match.group(2)}'] = value
            # NOTE(review): `kwargs` has no 'synthesis' entry in this
            # flattened layout, so this line would raise AttributeError for
            # progressive-growing pickles — confirm against such a pickle.
            kwargs.synthesis.kwargs.architecture = 'orig'
    #for name, value in tf_params.items(): print(f'{name:<50s}{list(value.shape)}')

    # Convert params.
    G = network_class(**kwargs).eval().requires_grad_(False)
    # pylint: disable=unnecessary-lambda
    # pylint: disable=f-string-without-interpolation
    # TF stores conv weights as HWIO; PyTorch wants OIHW, hence the
    # transpose(3, 2, 0, 1); [::-1, ::-1] flips kernels for up-convs.
    _populate_module_params(G,
        r'mapping\.w_avg', lambda: tf_params[f'dlatent_avg'],
        r'mapping\.embed\.weight', lambda: tf_params[f'mapping/LabelEmbed/weight'].transpose(),
        r'mapping\.embed\.bias', lambda: tf_params[f'mapping/LabelEmbed/bias'],
        r'mapping\.fc(\d+)\.weight', lambda i: tf_params[f'mapping/Dense{i}/weight'].transpose(),
        r'mapping\.fc(\d+)\.bias', lambda i: tf_params[f'mapping/Dense{i}/bias'],
        r'synthesis\.b4\.const', lambda: tf_params[f'synthesis/4x4/Const/const'][0],
        r'synthesis\.b4\.conv1\.weight', lambda: tf_params[f'synthesis/4x4/Conv/weight'].transpose(3, 2, 0, 1),
        r'synthesis\.b4\.conv1\.bias', lambda: tf_params[f'synthesis/4x4/Conv/bias'],
        r'synthesis\.b4\.conv1\.noise_const', lambda: tf_params[f'synthesis/noise0'][0, 0],
        r'synthesis\.b4\.conv1\.noise_strength', lambda: tf_params[f'synthesis/4x4/Conv/noise_strength'],
        r'synthesis\.b4\.conv1\.affine\.weight', lambda: tf_params[f'synthesis/4x4/Conv/mod_weight'].transpose(),
        r'synthesis\.b4\.conv1\.affine\.bias', lambda: tf_params[f'synthesis/4x4/Conv/mod_bias'] + 1,
        r'synthesis\.b(\d+)\.conv0\.weight', lambda r: tf_params[f'synthesis/{r}x{r}/Conv0_up/weight'][::-1, ::-1].transpose(3, 2, 0, 1),
        r'synthesis\.b(\d+)\.conv0\.bias', lambda r: tf_params[f'synthesis/{r}x{r}/Conv0_up/bias'],
        r'synthesis\.b(\d+)\.conv0\.noise_const', lambda r: tf_params[f'synthesis/noise{int(np.log2(int(r)))*2-5}'][0, 0],
        r'synthesis\.b(\d+)\.conv0\.noise_strength', lambda r: tf_params[f'synthesis/{r}x{r}/Conv0_up/noise_strength'],
        r'synthesis\.b(\d+)\.conv0\.affine\.weight', lambda r: tf_params[f'synthesis/{r}x{r}/Conv0_up/mod_weight'].transpose(),
        r'synthesis\.b(\d+)\.conv0\.affine\.bias', lambda r: tf_params[f'synthesis/{r}x{r}/Conv0_up/mod_bias'] + 1,
        r'synthesis\.b(\d+)\.conv1\.weight', lambda r: tf_params[f'synthesis/{r}x{r}/Conv1/weight'].transpose(3, 2, 0, 1),
        r'synthesis\.b(\d+)\.conv1\.bias', lambda r: tf_params[f'synthesis/{r}x{r}/Conv1/bias'],
        r'synthesis\.b(\d+)\.conv1\.noise_const', lambda r: tf_params[f'synthesis/noise{int(np.log2(int(r)))*2-4}'][0, 0],
        r'synthesis\.b(\d+)\.conv1\.noise_strength', lambda r: tf_params[f'synthesis/{r}x{r}/Conv1/noise_strength'],
        r'synthesis\.b(\d+)\.conv1\.affine\.weight', lambda r: tf_params[f'synthesis/{r}x{r}/Conv1/mod_weight'].transpose(),
        r'synthesis\.b(\d+)\.conv1\.affine\.bias', lambda r: tf_params[f'synthesis/{r}x{r}/Conv1/mod_bias'] + 1,
        r'synthesis\.b(\d+)\.torgb\.weight', lambda r: tf_params[f'synthesis/{r}x{r}/ToRGB/weight'].transpose(3, 2, 0, 1),
        r'synthesis\.b(\d+)\.torgb\.bias', lambda r: tf_params[f'synthesis/{r}x{r}/ToRGB/bias'],
        r'synthesis\.b(\d+)\.torgb\.affine\.weight', lambda r: tf_params[f'synthesis/{r}x{r}/ToRGB/mod_weight'].transpose(),
        r'synthesis\.b(\d+)\.torgb\.affine\.bias', lambda r: tf_params[f'synthesis/{r}x{r}/ToRGB/mod_bias'] + 1,
        r'synthesis\.b(\d+)\.skip\.weight', lambda r: tf_params[f'synthesis/{r}x{r}/Skip/weight'][::-1, ::-1].transpose(3, 2, 0, 1),
        r'.*\.resample_filter', None,
        r'.*\.act_filter', None,
    )
    return G
|
| 211 |
+
|
| 212 |
+
#----------------------------------------------------------------------------
|
| 213 |
+
|
| 214 |
+
def convert_tf_discriminator(tf_D):
    """Translate a TensorFlow StyleGAN2/ADA discriminator pickle into a PyTorch module.

    `tf_D` is the unpickled TF network object; returns the converted PyTorch
    discriminator with weights copied in. Raises ValueError for pickles that
    are too old or carry kwargs this converter does not recognize.
    """
    # Pickles older than version 4 use an incompatible layout.
    if tf_D.version < 4:
        raise ValueError('TensorFlow pickle version too low')

    # Collect kwargs. `kwarg` records every name it is asked for so that any
    # leftover (unknown) TF kwarg can be reported below.
    tf_kwargs = tf_D.static_kwargs
    known_kwargs = set()
    def kwarg(tf_name, default=None):
        known_kwargs.add(tf_name)
        return tf_kwargs.get(tf_name, default)

    # Convert kwargs: map TF config names onto the PyTorch constructor's names.
    kwargs = dnnlib.EasyDict(
        c_dim = kwarg('label_size', 0),
        img_resolution = kwarg('resolution', 1024),
        img_channels = kwarg('num_channels', 3),
        architecture = kwarg('architecture', 'resnet'),
        channel_base = kwarg('fmap_base', 16384) * 2,  # TF counts half as many base channels
        channel_max = kwarg('fmap_max', 512),
        num_fp16_res = kwarg('num_fp16_res', 0),
        conv_clamp = kwarg('conv_clamp', None),
        cmap_dim = kwarg('mapping_fmaps', None),
        block_kwargs = dnnlib.EasyDict(
            activation = kwarg('nonlinearity', 'lrelu'),
            resample_filter = kwarg('resample_kernel', [1,3,3,1]),
            freeze_layers = kwarg('freeze_layers', 0),
        ),
        mapping_kwargs = dnnlib.EasyDict(
            num_layers = kwarg('mapping_layers', 0),
            embed_features = kwarg('mapping_fmaps', None),
            layer_features = kwarg('mapping_fmaps', None),
            activation = kwarg('nonlinearity', 'lrelu'),
            lr_multiplier = kwarg('mapping_lrmul', 0.1),
        ),
        epilogue_kwargs = dnnlib.EasyDict(
            mbstd_group_size = kwarg('mbstd_group_size', None),
            mbstd_num_channels = kwarg('mbstd_num_features', 1),
            activation = kwarg('nonlinearity', 'lrelu'),
        ),
    )

    # Check for unknown kwargs. These two are accepted but deliberately ignored.
    kwarg('structure')
    kwarg('conditioning')
    unknown_kwargs = list(set(tf_kwargs.keys()) - known_kwargs)
    if len(unknown_kwargs) > 0:
        raise ValueError('Unknown TensorFlow kwarg', unknown_kwargs[0])

    # Collect params. Progressive-growing checkpoints store per-lod FromRGB
    # layers; rename them to the fixed-resolution layout and force 'orig'.
    tf_params = _collect_tf_params(tf_D)
    for name, value in list(tf_params.items()):
        match = re.fullmatch(r'FromRGB_lod(\d+)/(.*)', name)
        if match:
            r = kwargs.img_resolution // (2 ** int(match.group(1)))
            tf_params[f'{r}x{r}/FromRGB/{match.group(2)}'] = value
            kwargs.architecture = 'orig'
    #for name, value in tf_params.items(): print(f'{name:<50s}{list(value.shape)}')

    # Convert params.
    #from pg_modules import networks_stylegan2
    from pg_modules.discriminator import ProjectedDiscriminator

    D = ProjectedDiscriminator(**kwargs).eval().requires_grad_(False)
    # pylint: disable=unnecessary-lambda
    # pylint: disable=f-string-without-interpolation
    # Each (regex, converter) pair maps a PyTorch parameter name onto the TF
    # tensor; regex groups become the lambda's arguments. Conv weights go from
    # TF HWIO to PyTorch OIHW via transpose(3, 2, 0, 1); dense weights are
    # plain 2-D transposes.
    _populate_module_params(D,
        r'b(\d+)\.fromrgb\.weight', lambda r: tf_params[f'{r}x{r}/FromRGB/weight'].transpose(3, 2, 0, 1),
        r'b(\d+)\.fromrgb\.bias', lambda r: tf_params[f'{r}x{r}/FromRGB/bias'],
        r'b(\d+)\.conv(\d+)\.weight', lambda r, i: tf_params[f'{r}x{r}/Conv{i}{["","_down"][int(i)]}/weight'].transpose(3, 2, 0, 1),
        r'b(\d+)\.conv(\d+)\.bias', lambda r, i: tf_params[f'{r}x{r}/Conv{i}{["","_down"][int(i)]}/bias'],
        r'b(\d+)\.skip\.weight', lambda r: tf_params[f'{r}x{r}/Skip/weight'].transpose(3, 2, 0, 1),
        r'mapping\.embed\.weight', lambda: tf_params[f'LabelEmbed/weight'].transpose(),
        r'mapping\.embed\.bias', lambda: tf_params[f'LabelEmbed/bias'],
        r'mapping\.fc(\d+)\.weight', lambda i: tf_params[f'Mapping{i}/weight'].transpose(),
        r'mapping\.fc(\d+)\.bias', lambda i: tf_params[f'Mapping{i}/bias'],
        r'b4\.conv\.weight', lambda: tf_params[f'4x4/Conv/weight'].transpose(3, 2, 0, 1),
        r'b4\.conv\.bias', lambda: tf_params[f'4x4/Conv/bias'],
        r'b4\.fc\.weight', lambda: tf_params[f'4x4/Dense0/weight'].transpose(),
        r'b4\.fc\.bias', lambda: tf_params[f'4x4/Dense0/bias'],
        r'b4\.out\.weight', lambda: tf_params[f'Output/weight'].transpose(),
        r'b4\.out\.bias', lambda: tf_params[f'Output/bias'],
        r'.*\.resample_filter', None,  # constant buffers regenerated by the constructor
    )
    return D
|
| 298 |
+
|
| 299 |
+
#----------------------------------------------------------------------------
|
| 300 |
+
|
| 301 |
+
@click.command()
@click.option('--source', help='Input pickle', required=True, metavar='PATH')
@click.option('--dest', help='Output pickle', required=True, metavar='PATH')
@click.option('--force-fp16', help='Force the networks to use FP16', type=bool, default=False, metavar='BOOL', show_default=True)
def convert_network_pickle(source, dest, force_fp16):
    """Convert legacy network pickle into the native PyTorch format.

    The tool is able to load the main network configurations exported using the TensorFlow version of StyleGAN2 or StyleGAN2-ADA.
    It does not support e.g. StyleGAN2-ADA comparison methods, StyleGAN2 configs A-D, or StyleGAN1 networks.

    Example:

    \b
    python legacy.py \\
        --source=https://nvlabs-fi-cdn.nvidia.com/stylegan2/networks/stylegan2-cat-config-f.pkl \\
        --dest=stylegan2-cat-config-f.pkl
    """
    # `open_url` accepts both local paths and URLs; the actual TF->PyTorch
    # conversion happens inside load_network_pkl().
    print(f'Loading "{source}"...')
    with dnnlib.util.open_url(source) as f:
        data = load_network_pkl(f, force_fp16=force_fp16)
    # Re-save as a plain pickle of native PyTorch modules.
    print(f'Saving "{dest}"...')
    with open(dest, 'wb') as f:
        pickle.dump(data, f)
    print('Done.')
|
| 325 |
+
|
| 326 |
+
#----------------------------------------------------------------------------
|
| 327 |
+
|
| 328 |
+
if __name__ == "__main__":
|
| 329 |
+
convert_network_pickle() # pylint: disable=no-value-for-parameter
|
| 330 |
+
|
| 331 |
+
#----------------------------------------------------------------------------
|
misc.py
ADDED
|
@@ -0,0 +1,275 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 4 |
+
# and proprietary rights in and to this software, related documentation
|
| 5 |
+
# and any modifications thereto. Any use, reproduction, disclosure or
|
| 6 |
+
# distribution of this software and related documentation without an express
|
| 7 |
+
# license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 8 |
+
|
| 9 |
+
import re
|
| 10 |
+
import contextlib
|
| 11 |
+
import numpy as np
|
| 12 |
+
import torch
|
| 13 |
+
import warnings
|
| 14 |
+
import dnnlib
|
| 15 |
+
|
| 16 |
+
#----------------------------------------------------------------------------
|
| 17 |
+
# Cached construction of constant tensors. Avoids CPU=>GPU copy when the
|
| 18 |
+
# same constant is used multiple times.
|
| 19 |
+
|
| 20 |
+
# Cache of previously-built constant tensors, keyed by value bytes + options.
_constant_cache = dict()

def constant(value, shape=None, dtype=None, device=None, memory_format=None):
    """Return a cached constant tensor for `value`.

    Optionally broadcasts to `shape` and applies the given dtype / device /
    memory format. Repeated requests for the same constant return the same
    tensor object, avoiding redundant CPU=>GPU copies.
    """
    value = np.asarray(value)
    shape = None if shape is None else tuple(shape)
    dtype = torch.get_default_dtype() if dtype is None else dtype
    device = torch.device('cpu') if device is None else device
    memory_format = torch.contiguous_format if memory_format is None else memory_format

    # Raw bytes in the key guarantee distinct values never collide.
    key = (value.shape, value.dtype, value.tobytes(), shape, dtype, device, memory_format)
    cached = _constant_cache.get(key, None)
    if cached is not None:
        return cached

    built = torch.as_tensor(value.copy(), dtype=dtype, device=device)
    if shape is not None:
        built, _ = torch.broadcast_tensors(built, torch.empty(shape))
    built = built.contiguous(memory_format=memory_format)
    _constant_cache[key] = built
    return built
|
| 42 |
+
|
| 43 |
+
#----------------------------------------------------------------------------
|
| 44 |
+
# Replace NaN/Inf with specified numerical values.
|
| 45 |
+
|
| 46 |
+
# Use the native implementation when available (torch >= 1.8); otherwise fall
# back to an equivalent built from nansum/clamp. The fallback only supports
# nan=0, which is the only value this codebase uses.
try:
    nan_to_num = torch.nan_to_num # 1.8.0a0
except AttributeError:
    def nan_to_num(input, nan=0.0, posinf=None, neginf=None, *, out=None): # pylint: disable=redefined-builtin
        """Replace NaN with 0 and clamp +/-inf to the dtype's finite range."""
        assert isinstance(input, torch.Tensor)
        if posinf is None:
            posinf = torch.finfo(input.dtype).max
        if neginf is None:
            neginf = torch.finfo(input.dtype).min
        assert nan == 0
        # nansum over an added singleton dim turns NaN into 0 without
        # changing the shape; clamp then handles the infinities.
        return torch.clamp(input.unsqueeze(0).nansum(0), min=neginf, max=posinf, out=out)
|
| 57 |
+
|
| 58 |
+
#----------------------------------------------------------------------------
|
| 59 |
+
# Symbolic assert.
|
| 60 |
+
|
| 61 |
+
# Assert that survives torch.jit.trace(): torch._assert is recorded into the
# traced graph (torch >= 1.8); torch.Assert is the 1.7.0 spelling.
try:
    symbolic_assert = torch._assert # 1.8.0a0 # pylint: disable=protected-access
except AttributeError:
    symbolic_assert = torch.Assert # 1.7.0
|
| 65 |
+
|
| 66 |
+
#----------------------------------------------------------------------------
|
| 67 |
+
# Context manager to temporarily suppress known warnings in torch.jit.trace().
|
| 68 |
+
# Note: Cannot use catch_warnings because of https://bugs.python.org/issue29672
|
| 69 |
+
|
| 70 |
+
@contextlib.contextmanager
def suppress_tracer_warnings():
    """Temporarily silence torch.jit.TracerWarning.

    Inserts an 'ignore' entry at the front of the global warnings filter list
    and removes it on exit. Mutates `warnings.filters` directly instead of
    using `warnings.catch_warnings` because of
    https://bugs.python.org/issue29672.
    """
    flt = ('ignore', None, torch.jit.TracerWarning, None, 0)
    warnings.filters.insert(0, flt)
    try:
        yield
    finally:
        # Fix: previously the filter leaked permanently if the body raised.
        warnings.filters.remove(flt)
|
| 76 |
+
|
| 77 |
+
#----------------------------------------------------------------------------
|
| 78 |
+
# Assert that the shape of a tensor matches the given list of integers.
|
| 79 |
+
# None indicates that the size of a dimension is allowed to vary.
|
| 80 |
+
# Performs symbolic assertion when used in torch.jit.trace().
|
| 81 |
+
|
| 82 |
+
def assert_shape(tensor, ref_shape):
    """Check that `tensor` has shape `ref_shape`.

    A `None` entry allows any size for that dimension. Tensor-valued sizes
    (as produced inside torch.jit.trace()) are checked with a symbolic assert
    so the check is baked into the trace; plain mismatches raise
    AssertionError immediately.
    """
    if tensor.ndim != len(ref_shape):
        raise AssertionError(f'Wrong number of dimensions: got {tensor.ndim}, expected {len(ref_shape)}')
    for idx in range(len(ref_shape)):
        size, ref_size = tensor.shape[idx], ref_shape[idx]
        if ref_size is None:
            continue
        if isinstance(ref_size, torch.Tensor):
            with suppress_tracer_warnings(): # as_tensor results are registered as constants
                symbolic_assert(torch.equal(torch.as_tensor(size), ref_size), f'Wrong size for dimension {idx}')
        elif isinstance(size, torch.Tensor):
            with suppress_tracer_warnings(): # as_tensor results are registered as constants
                symbolic_assert(torch.equal(size, torch.as_tensor(ref_size)), f'Wrong size for dimension {idx}: expected {ref_size}')
        elif size != ref_size:
            raise AssertionError(f'Wrong size for dimension {idx}: got {size}, expected {ref_size}')
|
| 96 |
+
|
| 97 |
+
#----------------------------------------------------------------------------
|
| 98 |
+
# Function decorator that calls torch.autograd.profiler.record_function().
|
| 99 |
+
|
| 100 |
+
def profiled_function(fn):
    """Decorator that runs `fn` inside torch.autograd.profiler.record_function
    so its calls appear as a named range in profiler traces.

    Returns a wrapper with the same call signature as `fn`.
    """
    import functools
    @functools.wraps(fn)  # fix: preserve __doc__/__module__ etc., not just __name__
    def decorator(*args, **kwargs):
        with torch.autograd.profiler.record_function(fn.__name__):
            return fn(*args, **kwargs)
    return decorator
|
| 106 |
+
|
| 107 |
+
#----------------------------------------------------------------------------
|
| 108 |
+
# Sampler for torch.utils.data.DataLoader that loops over the dataset
|
| 109 |
+
# indefinitely, shuffling items as it goes.
|
| 110 |
+
|
| 111 |
+
class InfiniteSampler(torch.utils.data.Sampler):
    """Sampler that yields dataset indices forever, shuffling as it goes.

    Each of `num_replicas` ranks receives a disjoint, interleaved slice of the
    same infinite index stream. `window_size` (fraction of the dataset)
    controls continuous reshuffling: after visiting a position, it may be
    swapped with another position up to `window` steps back.
    """
    def __init__(self, dataset, rank=0, num_replicas=1, shuffle=True, seed=0, window_size=0.5):
        assert len(dataset) > 0
        assert num_replicas > 0
        assert 0 <= rank < num_replicas
        assert 0 <= window_size <= 1
        super().__init__(dataset)
        self.dataset = dataset
        self.rank = rank
        self.num_replicas = num_replicas
        self.shuffle = shuffle
        self.seed = seed
        self.window_size = window_size

    def __iter__(self):
        # `order` is mutated in place below to keep the stream shuffled.
        order = np.arange(len(self.dataset))
        rnd = None
        window = 0
        if self.shuffle:
            rnd = np.random.RandomState(self.seed)
            rnd.shuffle(order)
            window = int(np.rint(order.size * self.window_size))

        idx = 0
        while True:
            i = idx % order.size
            # Round-robin split of the infinite stream across replicas.
            if idx % self.num_replicas == self.rank:
                yield order[i]
            # Swap the current index with a random one inside the trailing
            # window so the order keeps changing between epochs.
            if window >= 2:
                j = (i - rnd.randint(window)) % order.size
                order[i], order[j] = order[j], order[i]
            idx += 1
|
| 143 |
+
|
| 144 |
+
#----------------------------------------------------------------------------
|
| 145 |
+
# Utilities for operating with torch.nn.Module parameters and buffers.
|
| 146 |
+
|
| 147 |
+
def params_and_buffers(module):
    """Return all parameters followed by all buffers of `module` as one list."""
    assert isinstance(module, torch.nn.Module)
    tensors = [p for p in module.parameters()]
    tensors.extend(module.buffers())
    return tensors
|
| 150 |
+
|
| 151 |
+
def named_params_and_buffers(module):
    """Return (name, tensor) pairs for all parameters followed by all buffers."""
    assert isinstance(module, torch.nn.Module)
    pairs = list(module.named_parameters())
    pairs.extend(module.named_buffers())
    return pairs
|
| 154 |
+
|
| 155 |
+
def copy_params_and_buffers(src_module, dst_module, require_all=False):
    """Copy matching parameters/buffers from `src_module` into `dst_module`.

    Tensors are matched by name. With `require_all`, every destination tensor
    must exist in the source. Copying is best-effort: tensors whose copy fails
    (e.g. shape mismatch) are skipped rather than aborting the whole transfer.
    """
    assert isinstance(src_module, torch.nn.Module)
    assert isinstance(dst_module, torch.nn.Module)
    src_tensors = dict(named_params_and_buffers(src_module))
    for name, tensor in named_params_and_buffers(dst_module):
        assert (name in src_tensors) or (not require_all)
        if name in src_tensors:
            try:
                tensor.copy_(src_tensors[name].detach()).requires_grad_(tensor.requires_grad)
            except Exception:  # fix: was a bare `except:` that also swallowed KeyboardInterrupt/SystemExit
                continue
|
| 166 |
+
|
| 167 |
+
#----------------------------------------------------------------------------
|
| 168 |
+
# Context manager for easily enabling/disabling DistributedDataParallel
|
| 169 |
+
# synchronization.
|
| 170 |
+
|
| 171 |
+
@contextlib.contextmanager
def ddp_sync(module, sync):
    """Context manager that disables DDP gradient synchronization when `sync`
    is False; a no-op for modules not wrapped in DistributedDataParallel."""
    assert isinstance(module, torch.nn.Module)
    is_ddp = isinstance(module, torch.nn.parallel.DistributedDataParallel)
    if is_ddp and not sync:
        with module.no_sync():
            yield
    else:
        yield
|
| 179 |
+
|
| 180 |
+
#----------------------------------------------------------------------------
|
| 181 |
+
# Check DistributedDataParallel consistency across processes.
|
| 182 |
+
|
| 183 |
+
def check_ddp_consistency(module, ignore_regex=None):
    """Assert that `module`'s tensors are identical across all DDP processes.

    Broadcasts each parameter/buffer from rank 0 and compares element-wise;
    fully-qualified names matching `ignore_regex` are skipped. Must be called
    collectively from every process in the default process group.
    """
    assert isinstance(module, torch.nn.Module)
    for name, tensor in named_params_and_buffers(module):
        fullname = type(module).__name__ + '.' + name
        if ignore_regex is not None and re.fullmatch(ignore_regex, fullname):
            continue
        tensor = tensor.detach()
        # NaN != NaN would make the equality check fail spuriously, so map
        # NaN/Inf to finite values on both sides before comparing.
        if tensor.is_floating_point():
            tensor = nan_to_num(tensor)
        other = tensor.clone()
        torch.distributed.broadcast(tensor=other, src=0)
        assert (tensor == other).all(), fullname
|
| 195 |
+
|
| 196 |
+
#----------------------------------------------------------------------------
|
| 197 |
+
# Print summary table of module hierarchy.
|
| 198 |
+
|
| 199 |
+
def print_module_summary(module, inputs, max_nesting=3, skip_redundant=True):
    """Run `module(*inputs)` once and print a table of its submodules with
    parameter/buffer counts, output shapes, and dtypes; return the outputs.

    `max_nesting` limits how deep in the module hierarchy rows are recorded;
    `skip_redundant` hides submodules contributing no new tensors.
    """
    assert isinstance(module, torch.nn.Module)
    assert not isinstance(module, torch.jit.ScriptModule)
    assert isinstance(inputs, (tuple, list))

    # Register hooks: the pre-hook tracks nesting depth, the post-hook records
    # tensor outputs for modules at depth <= max_nesting.
    entries = []
    nesting = [0]
    def pre_hook(_mod, _inputs):
        nesting[0] += 1
    def post_hook(mod, _inputs, outputs):
        nesting[0] -= 1
        if nesting[0] <= max_nesting:
            outputs = list(outputs) if isinstance(outputs, (tuple, list)) else [outputs]
            outputs = [t for t in outputs if isinstance(t, torch.Tensor)]
            entries.append(dnnlib.EasyDict(mod=mod, outputs=outputs))
    hooks = [mod.register_forward_pre_hook(pre_hook) for mod in module.modules()]
    hooks += [mod.register_forward_hook(post_hook) for mod in module.modules()]

    # Run module once to populate `entries`, then detach the hooks.
    outputs = module(*inputs)
    for hook in hooks:
        hook.remove()

    # Identify unique outputs, parameters, and buffers: the first entry that
    # owns a tensor gets credited, so shared tensors are not double-counted.
    tensors_seen = set()
    for e in entries:
        e.unique_params = [t for t in e.mod.parameters() if id(t) not in tensors_seen]
        e.unique_buffers = [t for t in e.mod.buffers() if id(t) not in tensors_seen]
        e.unique_outputs = [t for t in e.outputs if id(t) not in tensors_seen]
        tensors_seen |= {id(t) for t in e.unique_params + e.unique_buffers + e.unique_outputs}

    # Filter out redundant entries.
    if skip_redundant:
        entries = [e for e in entries if len(e.unique_params) or len(e.unique_buffers) or len(e.unique_outputs)]

    # Construct table.
    rows = [[type(module).__name__, 'Parameters', 'Buffers', 'Output shape', 'Datatype']]
    rows += [['---'] * len(rows[0])]
    param_total = 0
    buffer_total = 0
    submodule_names = {mod: name for name, mod in module.named_modules()}
    for e in entries:
        name = '<top-level>' if e.mod is module else submodule_names[e.mod]
        param_size = sum(t.numel() for t in e.unique_params)
        buffer_size = sum(t.numel() for t in e.unique_buffers)
        output_shapes = [str(list(t.shape)) for t in e.outputs]
        output_dtypes = [str(t.dtype).split('.')[-1] for t in e.outputs]
        # Modules with several outputs get one row per output (suffix :0, :1, ...).
        rows += [[
            name + (':0' if len(e.outputs) >= 2 else ''),
            str(param_size) if param_size else '-',
            str(buffer_size) if buffer_size else '-',
            (output_shapes + ['-'])[0],
            (output_dtypes + ['-'])[0],
        ]]
        for idx in range(1, len(e.outputs)):
            rows += [[name + f':{idx}', '-', '-', output_shapes[idx], output_dtypes[idx]]]
        param_total += param_size
        buffer_total += buffer_size
    rows += [['---'] * len(rows[0])]
    rows += [['Total', str(param_total), str(buffer_total), '-', '-']]

    # Print table with columns padded to the widest cell.
    widths = [max(len(cell) for cell in column) for column in zip(*rows)]
    print()
    for row in rows:
        print('  '.join(cell + ' ' * (width - len(cell)) for cell, width in zip(row, widths)))
    print()
    return outputs
|
| 268 |
+
|
| 269 |
+
#----------------------------------------------------------------------------
|
| 270 |
+
|
| 271 |
+
# Added by Katja
|
| 272 |
+
import os
|
| 273 |
+
|
| 274 |
+
def get_ckpt_path(run_dir):
    """Return the path of the network snapshot pickle inside `run_dir`."""
    # Fix: the filename was an f-string with no placeholders; plain literal.
    return os.path.join(run_dir, 'network-snapshot.pkl')
|
pg_modules/__init__.py
ADDED
|
File without changes
|
pg_modules/__pycache__/MViT.cpython-39.pyc
ADDED
|
Binary file (17.9 kB). View file
|
|
|
pg_modules/__pycache__/__init__.cpython-39.pyc
ADDED
|
Binary file (222 Bytes). View file
|
|
|
pg_modules/__pycache__/blocks.cpython-38.pyc
ADDED
|
Binary file (10.5 kB). View file
|
|
|
pg_modules/__pycache__/blocks.cpython-39.pyc
ADDED
|
Binary file (11.8 kB). View file
|
|
|
pg_modules/__pycache__/diffaug.cpython-38.pyc
ADDED
|
Binary file (2.69 kB). View file
|
|
|
pg_modules/__pycache__/diffaug.cpython-39.pyc
ADDED
|
Binary file (2.81 kB). View file
|
|
|
pg_modules/__pycache__/discriminator.cpython-38.pyc
ADDED
|
Binary file (5.65 kB). View file
|
|
|
pg_modules/__pycache__/discriminator.cpython-39.pyc
ADDED
|
Binary file (4.51 kB). View file
|
|
|
pg_modules/__pycache__/mae.cpython-39.pyc
ADDED
|
Binary file (8.85 kB). View file
|
|
|
pg_modules/__pycache__/models_tnt.cpython-39.pyc
ADDED
|
Binary file (17.5 kB). View file
|
|
|
pg_modules/__pycache__/networks_fastgan.cpython-38.pyc
ADDED
|
Binary file (5.2 kB). View file
|
|
|
pg_modules/__pycache__/networks_fastgan.cpython-39.pyc
ADDED
|
Binary file (5.34 kB). View file
|
|
|
pg_modules/__pycache__/networks_stylegan2.cpython-39.pyc
ADDED
|
Binary file (15.5 kB). View file
|
|
|
pg_modules/__pycache__/projector.cpython-38.pyc
ADDED
|
Binary file (3.85 kB). View file
|
|
|
pg_modules/__pycache__/projector.cpython-39.pyc
ADDED
|
Binary file (4.21 kB). View file
|
|
|
pg_modules/__pycache__/simmim.cpython-39.pyc
ADDED
|
Binary file (4.22 kB). View file
|
|
|
pg_modules/__pycache__/vision_transformer.cpython-39.pyc
ADDED
|
Binary file (11.9 kB). View file
|
|
|
pg_modules/blocks.py
ADDED
|
@@ -0,0 +1,370 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import functools
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
import torch.nn.functional as F
|
| 5 |
+
from torch.nn.utils import spectral_norm
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
### single layers
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def conv2d(*args, **kwargs):
    """nn.Conv2d wrapped in spectral normalization."""
    layer = nn.Conv2d(*args, **kwargs)
    return spectral_norm(layer)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def convTranspose2d(*args, **kwargs):
    """nn.ConvTranspose2d wrapped in spectral normalization."""
    layer = nn.ConvTranspose2d(*args, **kwargs)
    return spectral_norm(layer)
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def embedding(*args, **kwargs):
    """nn.Embedding wrapped in spectral normalization."""
    layer = nn.Embedding(*args, **kwargs)
    return spectral_norm(layer)
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def linear(*args, **kwargs):
    """nn.Linear wrapped in spectral normalization."""
    layer = nn.Linear(*args, **kwargs)
    return spectral_norm(layer)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def NormLayer(c, mode='batch'):
    """Return a normalization layer for `c` channels.

    `mode` is 'batch' (BatchNorm2d) or 'group' (GroupNorm with c//2 groups).
    Raises ValueError for any other mode.
    """
    if mode == 'group':
        return nn.GroupNorm(c//2, c)
    elif mode == 'batch':
        return nn.BatchNorm2d(c)
    # Fix: an unknown mode previously fell through and returned None silently.
    raise ValueError(f'unknown norm mode: {mode!r}')
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
### Activations
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
class GLU(nn.Module):
    """Gated Linear Unit: split channels in half and gate the first half with
    the sigmoid of the second half."""
    def forward(self, x):
        channels = x.size(1)
        assert channels % 2 == 0, 'channels dont divide 2!'
        half = channels // 2
        gate = torch.sigmoid(x[:, half:])
        return x[:, :half] * gate
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
class Swish(nn.Module):
    """Swish activation: x * sigmoid(x)."""
    def forward(self, feat):
        gate = torch.sigmoid(feat)
        return feat * gate
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
### Upblocks
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
class InitLayer(nn.Module):
    """Project a latent vector to an initial sz x sz feature map via a
    spectral-norm transposed conv, normalization, and GLU."""
    def __init__(self, nz, channel, sz=4):
        super().__init__()
        # Produce 2x channels before GLU, which halves them back to `channel`.
        layers = [
            convTranspose2d(nz, channel*2, sz, 1, 0, bias=False),
            NormLayer(channel*2),
            GLU(),
        ]
        self.init = nn.Sequential(*layers)

    def forward(self, noise):
        flat = noise.view(noise.shape[0], -1, 1, 1)
        return self.init(flat)
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def UpBlockSmall(in_planes, out_planes):
    """2x nearest-neighbor upsample -> spectral conv -> norm -> GLU."""
    return nn.Sequential(
        nn.Upsample(scale_factor=2, mode='nearest'),
        conv2d(in_planes, out_planes*2, 3, 1, 1, bias=False),
        NormLayer(out_planes*2),
        GLU(),
    )
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
class UpBlockSmallCond(nn.Module):
    """Conditional small up-block: upsample -> conv -> conditional BN -> GLU.

    NOTE(review): CCBN is defined elsewhere in this module; presumably a
    class-conditional BatchNorm called as bn(features, condition) — confirm.
    """
    def __init__(self, in_planes, out_planes, z_dim):
        super().__init__()
        self.in_planes = in_planes
        self.out_planes = out_planes
        self.up = nn.Upsample(scale_factor=2, mode='nearest')
        self.conv = conv2d(in_planes, out_planes*2, 3, 1, 1, bias=False)

        # Conditional norm over the doubled channels; GLU halves them after.
        which_bn = functools.partial(CCBN, which_linear=linear, input_size=z_dim)
        self.bn = which_bn(2*out_planes)
        self.act = GLU()

    def forward(self, x, c):
        x = self.up(x)
        x = self.conv(x)
        x = self.bn(x, c)
        x = self.act(x)
        return x
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def UpBlockBig(in_planes, out_planes):
    """2x upsample followed by two (conv -> noise -> norm -> GLU) stages."""
    first_stage = [
        nn.Upsample(scale_factor=2, mode='nearest'),
        conv2d(in_planes, out_planes*2, 3, 1, 1, bias=False),
        NoiseInjection(),
        NormLayer(out_planes*2),
        GLU(),
    ]
    second_stage = [
        conv2d(out_planes, out_planes*2, 3, 1, 1, bias=False),
        NoiseInjection(),
        NormLayer(out_planes*2),
        GLU(),
    ]
    return nn.Sequential(*first_stage, *second_stage)
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
class UpBlockBigCond(nn.Module):
    """Conditional big up-block: 2x upsample, then two
    (conv -> noise -> conditional BN -> GLU) stages.

    NOTE(review): CCBN is defined elsewhere in this module; presumably a
    class-conditional BatchNorm called as bn(features, condition) — confirm.
    """
    def __init__(self, in_planes, out_planes, z_dim):
        super().__init__()
        self.in_planes = in_planes
        self.out_planes = out_planes
        self.up = nn.Upsample(scale_factor=2, mode='nearest')
        self.conv1 = conv2d(in_planes, out_planes*2, 3, 1, 1, bias=False)
        self.conv2 = conv2d(out_planes, out_planes*2, 3, 1, 1, bias=False)

        which_bn = functools.partial(CCBN, which_linear=linear, input_size=z_dim)
        self.bn1 = which_bn(2*out_planes)
        self.bn2 = which_bn(2*out_planes)
        self.act = GLU()
        # One NoiseInjection instance shared by both stages.
        self.noise = NoiseInjection()

    def forward(self, x, c):
        # block 1
        x = self.up(x)
        x = self.conv1(x)
        x = self.noise(x)
        x = self.bn1(x, c)
        x = self.act(x)

        # block 2
        x = self.conv2(x)
        x = self.noise(x)
        x = self.bn2(x, c)
        x = self.act(x)

        return x
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
class SEBlock(nn.Module):
    """Squeeze-and-excitation: gate `feat_big` channel-wise by a sigmoid
    excitation computed from `feat_small`."""
    def __init__(self, ch_in, ch_out):
        super().__init__()
        layers = [
            nn.AdaptiveAvgPool2d(4),
            conv2d(ch_in, ch_out, 4, 1, 0, bias=False),
            Swish(),
            conv2d(ch_out, ch_out, 1, 1, 0, bias=False),
            nn.Sigmoid(),
        ]
        self.main = nn.Sequential(*layers)

    def forward(self, feat_small, feat_big):
        gate = self.main(feat_small)
        return feat_big * gate
|
| 154 |
+
|
| 155 |
+
|
| 156 |
+
### Downblocks
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
class SeparableConv2d(nn.Module):
    """Depthwise-separable convolution: depthwise conv followed by a 1x1
    pointwise conv, both spectral-normalized. 'Same' padding keeps the
    spatial size for odd kernels."""
    def __init__(self, in_channels, out_channels, kernel_size, bias=False):
        super(SeparableConv2d, self).__init__()
        # Fix: padding was hard-coded to 1, which is only correct for
        # kernel_size == 3 (the only value this file uses). kernel_size // 2
        # gives 'same' padding for any odd kernel and is identical for 3.
        self.depthwise = conv2d(in_channels, in_channels, kernel_size=kernel_size,
                                groups=in_channels, bias=bias, padding=kernel_size // 2)
        self.pointwise = conv2d(in_channels, out_channels,
                                kernel_size=1, bias=bias)

    def forward(self, x):
        out = self.depthwise(x)
        out = self.pointwise(out)
        return out
|
| 171 |
+
|
| 172 |
+
|
| 173 |
+
class DownBlock(nn.Module):
    """Halve spatial resolution: strided spectral conv (default) or a
    separable conv followed by average pooling."""
    def __init__(self, in_planes, out_planes, separable=False):
        super().__init__()
        if separable:
            layers = [
                SeparableConv2d(in_planes, out_planes, 3),
                NormLayer(out_planes),
                nn.LeakyReLU(0.2, inplace=True),
                nn.AvgPool2d(2, 2),
            ]
        else:
            layers = [
                conv2d(in_planes, out_planes, 4, 2, 1),
                NormLayer(out_planes),
                nn.LeakyReLU(0.2, inplace=True),
            ]
        self.main = nn.Sequential(*layers)

    def forward(self, feat):
        return self.main(feat)
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
class DownBlockPatch(nn.Module):
    """DownBlock followed by an extra 1x1 conv + norm + LeakyReLU stage."""
    def __init__(self, in_planes, out_planes, separable=False):
        super().__init__()
        stages = [
            DownBlock(in_planes, out_planes, separable),
            conv2d(out_planes, out_planes, 1, 1, 0, bias=False),
            NormLayer(out_planes),
            nn.LeakyReLU(0.2, inplace=True),
        ]
        self.main = nn.Sequential(*stages)

    def forward(self, feat):
        return self.main(feat)
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
### CSM
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
class ResidualConvUnit(nn.Module):
    """3x3 conv with identity skip: returns conv(x) + x.

    `activation` and `bn` are accepted for interface compatibility but unused.
    """
    def __init__(self, cin, activation, bn):
        super().__init__()
        self.conv = nn.Conv2d(cin, cin, kernel_size=3, stride=1, padding=1, bias=True)
        # FloatFunctional keeps the residual add quantization-friendly.
        self.skip_add = nn.quantized.FloatFunctional()

    def forward(self, x):
        residual = self.conv(x)
        return self.skip_add.add(residual, x)
|
| 219 |
+
|
| 220 |
+
|
| 221 |
+
class FeatureFusionBlock(nn.Module):
    """Fuse up to two feature maps, upsample 2x, then 1x1-project (CSM block).

    forward(*xs): xs[0] is the base map; if a second map is given it is added
    (quantization-friendly add). The result is bilinearly upsampled by 2 and
    passed through a 1x1 conv.

    NOTE(review): `activation`, `deconv`, `bn` and `lowest` are accepted but
    unused here — likely kept for API parity with the original DPT/MiDaS block.
    """
    def __init__(self, features, activation, deconv=False, bn=False, expand=False, align_corners=True, lowest=False):
        super().__init__()

        self.deconv = deconv
        self.align_corners = align_corners

        self.expand = expand
        out_features = features
        # When expanding, halve the channel count in the output projection.
        if self.expand==True:
            out_features = features//2

        self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1)
        self.skip_add = nn.quantized.FloatFunctional()

    def forward(self, *xs):
        output = xs[0]

        if len(xs) == 2:
            output = self.skip_add.add(output, xs[1])

        output = nn.functional.interpolate(
            output, scale_factor=2, mode="bilinear", align_corners=self.align_corners
        )

        output = self.out_conv(output)

        return output
|
| 249 |
+
|
| 250 |
+
class FeatureFusionBlock_V2(nn.Module):
    """Variant of FeatureFusionBlock that skips the 2x upsampling.

    NOTE(review): unlike FeatureFusionBlock, the `expand` branch assigns
    `out_features = features` — a no-op (V1 halves it). The interpolate call
    is commented out, so this block preserves spatial size. Both look
    deliberate for this variant, but confirm against the training code.
    """
    def __init__(self, features, activation, deconv=False, bn=False, expand=False, align_corners=True, lowest=False):
        super().__init__()

        self.deconv = deconv
        self.align_corners = align_corners

        self.expand = expand
        out_features = features
        if self.expand==True:
            out_features = features

        self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1)
        self.skip_add = nn.quantized.FloatFunctional()

    def forward(self, *xs):
        output = xs[0]

        if len(xs) == 2:
            output = self.skip_add.add(output, xs[1])

        # Upsampling deliberately disabled in this variant (see class note).
        # output = nn.functional.interpolate(
        #     output, scale_factor=2, mode="bilinear", align_corners=self.align_corners
        # )

        output = self.out_conv(output)

        return output
|
| 278 |
+
from timm.models.vision_transformer import PatchEmbed, Block
|
| 279 |
+
|
| 280 |
+
class FeatureFusionBlockTrans(nn.Module):
    """Transformer-based fusion: optional skip-add, then a timm ViT Block.

    Expects token-shaped input (as required by timm's `Block`), not 2D
    feature maps; num_heads is fixed at 12.
    """
    def __init__(self, features):
        super().__init__()
        self.out_conv = Block(features,num_heads=12)
        self.skip_add = nn.quantized.FloatFunctional()

    def forward(self, *xs):
        output = xs[0]

        if len(xs) == 2:
            output = self.skip_add.add(output, xs[1])
        output = self.out_conv(output)

        return output
|
| 294 |
+
|
| 295 |
+
|
| 296 |
+
### Misc
|
| 297 |
+
|
| 298 |
+
|
| 299 |
+
class NoiseInjection(nn.Module):
    """Add learnable-scaled per-pixel Gaussian noise (StyleGAN-style).

    The scale parameter is zero-initialized, so the module is an exact
    identity at the start of training.
    """
    def __init__(self):
        super().__init__()
        # Scalar noise strength, learned during training.
        self.weight = nn.Parameter(torch.zeros(1), requires_grad=True)

    def forward(self, feat, noise=None):
        """feat: (B, C, H, W). noise: optional (B, 1, H, W); sampled if None."""
        if noise is None:
            batch, _, height, width = feat.shape
            # Allocate directly on feat's device/dtype instead of a CPU
            # tensor followed by .to(device) (avoids an extra alloc + copy).
            noise = torch.randn(batch, 1, height, width,
                                device=feat.device, dtype=feat.dtype)

        return feat + self.weight * noise
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
class CCBN(nn.Module):
    """Conditional batch norm: normalize x, then apply class-conditional
    gain and bias computed from the conditioning vector y.

    Args:
        output_size: number of channels of x being normalized.
        input_size: dimensionality of the conditioning vector y.
        which_linear: linear-layer factory, e.g. ``nn.Linear``.
        eps: numerical stability constant for the normalization.
        momentum: running-stats momentum passed to ``F.batch_norm``.
    """
    def __init__(self, output_size, input_size, which_linear, eps=1e-5, momentum=0.1):
        super().__init__()
        self.output_size, self.input_size = output_size, input_size

        # Prepare gain and bias layers (gain is offset by +1 in forward,
        # so zero-initialized weights yield an identity affine transform).
        self.gain = which_linear(input_size, output_size)
        self.bias = which_linear(input_size, output_size)

        # epsilon to avoid dividing by 0
        self.eps = eps
        # Momentum
        self.momentum = momentum

        # Running statistics used in eval mode (updated in train mode).
        self.register_buffer('stored_mean', torch.zeros(output_size))
        self.register_buffer('stored_var', torch.ones(output_size))

    def forward(self, x, y):
        # Calculate class-conditional gains and biases
        gain = (1 + self.gain(y)).view(y.size(0), -1, 1, 1)
        bias = self.bias(y).view(y.size(0), -1, 1, 1)
        # Bug fix: use the configured momentum (was hard-coded to 0.1,
        # silently ignoring the constructor argument).
        out = F.batch_norm(x, self.stored_mean, self.stored_var, None, None,
                           self.training, self.momentum, self.eps)
        return out * gain + bias
|
| 337 |
+
|
| 338 |
+
|
| 339 |
+
class Interpolate(nn.Module):
    """Resize input tensors to a fixed spatial size.

    Thin module wrapper around ``nn.functional.interpolate``.

    Args:
        size: target spatial size forwarded to ``interpolate``.
        mode (str): interpolation mode (default ``'bilinear'``).
        align_corners (bool): forwarded to ``interpolate``.
    """

    def __init__(self, size, mode='bilinear', align_corners=False):
        super(Interpolate, self).__init__()

        self.interp = nn.functional.interpolate
        self.size = size
        self.mode = mode
        self.align_corners = align_corners

    def forward(self, x):
        """Return ``x`` resized to ``self.size``."""
        resized = self.interp(
            x,
            size=self.size,
            mode=self.mode,
            align_corners=self.align_corners,
        )
        return resized
|
pg_modules/diffaug.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Differentiable Augmentation for Data-Efficient GAN Training
|
| 2 |
+
# Shengyu Zhao, Zhijian Liu, Ji Lin, Jun-Yan Zhu, and Song Han
|
| 3 |
+
# https://arxiv.org/pdf/2006.10738
|
| 4 |
+
|
| 5 |
+
import torch
|
| 6 |
+
import torch.nn.functional as F
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
def DiffAugment(x, policy='', channels_first=True):
    """Apply the comma-separated augmentation policies from AUGMENT_FNS to x.

    An empty policy returns x untouched. If ``channels_first`` is False the
    tensor is treated as NHWC and converted around the augmentations.
    """
    if not policy:
        return x
    if not channels_first:
        x = x.permute(0, 3, 1, 2)
    for name in policy.split(','):
        for fn in AUGMENT_FNS[name]:
            x = fn(x)
    if not channels_first:
        x = x.permute(0, 2, 3, 1)
    return x.contiguous()
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def rand_brightness(x):
    """Shift each sample by a random brightness offset in [-0.5, 0.5)."""
    offset = torch.rand(x.size(0), 1, 1, 1, dtype=x.dtype, device=x.device) - 0.5
    return x + offset
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def rand_saturation(x):
    """Scale each sample's deviation from its channel mean by a factor in [0, 2)."""
    channel_mean = x.mean(dim=1, keepdim=True)
    scale = torch.rand(x.size(0), 1, 1, 1, dtype=x.dtype, device=x.device) * 2
    return (x - channel_mean) * scale + channel_mean
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def rand_contrast(x):
    """Scale each sample's deviation from its global mean by a factor in [0.5, 1.5)."""
    sample_mean = x.mean(dim=[1, 2, 3], keepdim=True)
    scale = torch.rand(x.size(0), 1, 1, 1, dtype=x.dtype, device=x.device) + 0.5
    return (x - sample_mean) * scale + sample_mean
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def rand_translation(x, ratio=0.125):
    """Randomly shift each sample by up to `ratio` of its spatial size.

    Out-of-bounds pixels are zero-filled (via a 1-pixel pad and clamped
    gather indices).
    """
    max_dx = int(x.size(2) * ratio + 0.5)
    max_dy = int(x.size(3) * ratio + 0.5)
    dx = torch.randint(-max_dx, max_dx + 1, size=[x.size(0), 1, 1], device=x.device)
    dy = torch.randint(-max_dy, max_dy + 1, size=[x.size(0), 1, 1], device=x.device)
    idx_b, idx_x, idx_y = torch.meshgrid(
        torch.arange(x.size(0), dtype=torch.long, device=x.device),
        torch.arange(x.size(2), dtype=torch.long, device=x.device),
        torch.arange(x.size(3), dtype=torch.long, device=x.device),
    )
    # +1 accounts for the zero pad; clamping into the padded range makes
    # shifted-out positions read the zero border.
    idx_x = torch.clamp(idx_x + dx + 1, 0, x.size(2) + 1)
    idx_y = torch.clamp(idx_y + dy + 1, 0, x.size(3) + 1)
    padded = F.pad(x, [1, 1, 1, 1, 0, 0, 0, 0])
    return padded.permute(0, 2, 3, 1).contiguous()[idx_b, idx_x, idx_y].permute(0, 3, 1, 2)
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def rand_cutout(x, ratio=0.2):
    """Zero out one random rectangle of roughly `ratio` * spatial size per sample."""
    cut_h = int(x.size(2) * ratio + 0.5)
    cut_w = int(x.size(3) * ratio + 0.5)
    off_x = torch.randint(0, x.size(2) + (1 - cut_h % 2), size=[x.size(0), 1, 1], device=x.device)
    off_y = torch.randint(0, x.size(3) + (1 - cut_w % 2), size=[x.size(0), 1, 1], device=x.device)
    idx_b, idx_x, idx_y = torch.meshgrid(
        torch.arange(x.size(0), dtype=torch.long, device=x.device),
        torch.arange(cut_h, dtype=torch.long, device=x.device),
        torch.arange(cut_w, dtype=torch.long, device=x.device),
    )
    # Center the rectangle on the sampled offset, clamped to the image.
    idx_x = torch.clamp(idx_x + off_x - cut_h // 2, min=0, max=x.size(2) - 1)
    idx_y = torch.clamp(idx_y + off_y - cut_w // 2, min=0, max=x.size(3) - 1)
    mask = torch.ones(x.size(0), x.size(2), x.size(3), dtype=x.dtype, device=x.device)
    mask[idx_b, idx_x, idx_y] = 0
    return x * mask.unsqueeze(1)
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
# Maps a policy name to the list of augmentation callables that DiffAugment
# applies in order for that policy.
AUGMENT_FNS = {
    'color': [rand_brightness, rand_saturation, rand_contrast],
    'translation': [rand_translation],
    'cutout': [rand_cutout],
}
|
pg_modules/discriminator.py
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
import torch.nn.functional as F
|
| 5 |
+
from torchvision.transforms import Normalize
|
| 6 |
+
import pickle
|
| 7 |
+
|
| 8 |
+
from pg_modules.diffaug import DiffAugment
|
| 9 |
+
from pg_modules.blocks import conv2d, DownBlock, DownBlockPatch
|
| 10 |
+
from pg_modules.projector import F_RandomProj
|
| 11 |
+
from feature_networks.constants import VITS
|
| 12 |
+
|
| 13 |
+
class SingleDisc(nn.Module):
    """Single patch-style discriminator operating on one feature level.

    Builds a stack of DownBlocks from `start_sz` down to `end_sz`, ending in a
    4x4 conv producing a 1-channel logit map. `c` in forward is accepted but
    unused (unconditional head).
    """
    def __init__(self, nc=None, ndf=None, start_sz=256, end_sz=8, head=None, patch=False):
        super().__init__()

        # midas channels
        nfc_midas = {4: 512, 8: 512, 16: 256, 32: 128, 64: 64, 128: 64,
                     256: 32, 512: 16, 1024: 8}

        # interpolate for start sz that are not powers of two
        if start_sz not in nfc_midas.keys():
            sizes = np.array(list(nfc_midas.keys()))
            start_sz = sizes[np.argmin(abs(sizes - start_sz))]
        self.start_sz = start_sz

        # if given ndf, allocate all layers with the same ndf
        if ndf is None:
            nfc = nfc_midas
        else:
            nfc = {k: ndf for k, v in nfc_midas.items()}

        # for feature map discriminators with nfc not in nfc_midas
        # this is the case for the pretrained backbone (midas.pretrained)
        if nc is not None and head is None:
            nfc[start_sz] = nc

        layers = []

        # Head if the initial input is the full modality
        if head:
            layers += [conv2d(nc, nfc[256], 3, 1, 1, bias=False),
                       nn.LeakyReLU(0.2, inplace=True)]

        # Down Blocks
        DB = DownBlockPatch if patch else DownBlock
        while start_sz > end_sz:
            layers.append(DB(nfc[start_sz], nfc[start_sz//2]))
            start_sz = start_sz // 2

        # Final 4x4 conv collapses the end_sz x end_sz map to logits.
        layers.append(conv2d(nfc[end_sz], 1, 4, 1, 0, bias=False))
        self.main = nn.Sequential(*layers)

    def forward(self, x, c):
        # `c` is ignored; kept for signature parity with conditional discs.
        return self.main(x)
|
| 56 |
+
|
| 57 |
+
class MultiScaleD(nn.Module):
    """Bundle of SingleDisc heads, one per backbone feature level.

    NOTE(review): `proj_type`, `cond` and forward's `rec` are accepted but
    unused in this implementation — presumably kept for config compatibility.
    """
    def __init__(
        self,
        channels,
        resolutions,
        num_discs=4,
        proj_type=2,  # 0 = no projection, 1 = cross channel mixing, 2 = cross scale mixing
        cond=0,
        patch=False,
        **kwargs,
    ):
        super().__init__()

        assert num_discs in [1, 2, 3, 4, 5]

        # the first disc is on the lowest level of the backbone
        self.disc_in_channels = channels[:num_discs]
        self.disc_in_res = resolutions[:num_discs]
        Disc = SingleDisc

        mini_discs = []
        for i, (cin, res) in enumerate(zip(self.disc_in_channels, self.disc_in_res)):
            start_sz = res if not patch else 16
            # Trailing comma builds (key, module) pairs for ModuleDict.
            mini_discs += [str(i), Disc(nc=cin, start_sz=start_sz, end_sz=8, patch=patch)],

        self.mini_discs = nn.ModuleDict(mini_discs)

    def forward(self, features, c, rec=False):
        # Run each head on its feature level and concatenate flattened logits.
        all_logits = []
        for k, disc in self.mini_discs.items():
            all_logits.append(disc(features[k], c).view(features[k].size(0), -1))

        all_logits = torch.cat(all_logits, dim=1)
        return all_logits
|
| 91 |
+
|
| 92 |
+
class ProjectedDiscriminator(torch.nn.Module):
    """Projected-GAN discriminator: frozen feature backbones + trainable
    multi-scale discriminator heads, one pair per backbone name.

    NOTE(review): `backbone_kwargs={}` is a mutable default argument — safe
    here only because it is never mutated; consider `None` + fallback.
    """
    def __init__(
        self,
        backbones,
        diffaug=True,
        interp224=True,
        backbone_kwargs={},
        **kwargs
    ):
        super().__init__()
        self.backbones = backbones
        self.diffaug = diffaug
        self.interp224 = interp224

        # get backbones and multi-scale discs
        feature_networks, discriminators = [], []

        for i, bb_name in enumerate(backbones):

            feat = F_RandomProj(bb_name, **backbone_kwargs)
            disc = MultiScaleD(
                channels=feat.CHANNELS,
                resolutions=feat.RESOLUTIONS,
                **backbone_kwargs,
            )

            feature_networks.append([bb_name, feat])
            discriminators.append([bb_name, disc])

        self.feature_networks = nn.ModuleDict(feature_networks)
        self.discriminators = nn.ModuleDict(discriminators)

    def train(self, mode=True):
        # Feature backbones stay frozen in eval mode regardless of `mode`;
        # only the discriminator heads follow the requested training mode.
        self.feature_networks = self.feature_networks.train(False)
        self.discriminators = self.discriminators.train(mode)
        return self

    def eval(self):
        return self.train(False)

    def forward(self, x, c):
        # NOTE(review): `logits += <tensor>` extends the list with the
        # tensor's first-dim slices; confirm callers expect that shape.
        logits = []
        for bb_name, feat in self.feature_networks.items():

            # apply augmentation (x in [-1, 1])
            x_aug = DiffAugment(x, policy='color,translation,cutout') if self.diffaug else x

            # transform to [0,1]
            x_aug = x_aug.add(1).div(2)

            # apply F-specific normalization
            x_n = Normalize(feat.normstats['mean'], feat.normstats['std'])(x_aug)

            # upsample if smaller, downsample if larger + VIT
            if self.interp224 or bb_name in VITS:
                x_n = F.interpolate(x_n, 224, mode='bilinear', align_corners=False)

            # forward pass
            features = feat(x_n)
            logits += self.discriminators[bb_name](features, c)

        return logits
|
pg_modules/networks_fastgan.py
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# original implementation: https://github.com/odegeasslbc/FastGAN-pytorch/blob/main/models.py
|
| 2 |
+
#
|
| 3 |
+
# modified by Axel Sauer for "Projected GANs Converge Faster"
|
| 4 |
+
#
|
| 5 |
+
import torch.nn as nn
|
| 6 |
+
from pg_modules.blocks import (InitLayer, UpBlockBig, UpBlockBigCond, UpBlockSmall, UpBlockSmallCond, SEBlock, conv2d)
|
| 7 |
+
import torch
|
| 8 |
+
|
| 9 |
+
def normalize_second_moment(x, dim=1, eps=1e-8):
    """Scale x so its mean squared value along `dim` is ~1 (pixel-norm style)."""
    second_moment = x.square().mean(dim=dim, keepdim=True)
    return x * (second_moment + eps).rsqrt()
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class DummyMapping(nn.Module):
    """Identity mapping network that only adds a broadcast axis.

    Exists so the FastGAN generator matches the StyleGAN mapping API.
    """
    def __init__(self):
        super().__init__()

    def forward(self, z, c, **kwargs):
        # Insert a num_ws-like axis so downstream code can index ws[:, 0].
        return z.unsqueeze(1)
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class FastganSynthesis(nn.Module):
    """Unconditional FastGAN synthesis network.

    Maps a latent (after pixel-norm) through progressively upsampling blocks
    with skip-excitation (SEBlock) connections, ending in a tanh RGB head.
    `InitLayer`, `UpBlockSmall/Big`, `SEBlock` and `conv2d` are project
    helpers defined elsewhere in this repository.
    """
    def __init__(self, ngf=128, z_dim=256, nc=3, img_resolution=256, lite=False):
        super().__init__()
        self.img_resolution = img_resolution
        self.z_dim = z_dim

        # channel multiplier per resolution
        nfc_multi = {2: 16, 4:16, 8:8, 16:4, 32:2, 64:2, 128:1, 224:0.5, 256:0.5,
                     512:0.25, 1024:0.125}
        nfc = {}
        for k, v in nfc_multi.items():
            nfc[k] = int(v*ngf)

        # layers
        self.init = InitLayer(z_dim, channel=nfc[2], sz=4)

        UpBlock = UpBlockSmall if lite else UpBlockBig

        self.feat_8 = UpBlock(nfc[4], nfc[8])
        self.feat_16 = UpBlock(nfc[8], nfc[16])
        self.feat_32 = UpBlock(nfc[16], nfc[32])
        self.feat_64 = UpBlock(nfc[32], nfc[64])
        self.feat_128 = UpBlock(nfc[64], nfc[128])
        self.feat_256 = UpBlock(nfc[128], nfc[256])

        self.se_64 = SEBlock(nfc[4], nfc[64])
        self.se_128 = SEBlock(nfc[8], nfc[128])
        self.se_256 = SEBlock(nfc[16], nfc[256])

        self.to_big = conv2d(nfc[img_resolution], nc, 3, 1, 1, bias=True)

        if img_resolution > 256:
            self.feat_512 = UpBlock(nfc[256], nfc[512])
            self.se_512 = SEBlock(nfc[32], nfc[512])
        if img_resolution > 512:
            self.feat_1024 = UpBlock(nfc[512], nfc[1024])

    def forward(self, input, c, **kwargs):
        # map noise to hypersphere as in "Progressive Growing of GANS"
        input = normalize_second_moment(input[:, 0])

        feat_4 = self.init(input)
        feat_8 = self.feat_8(feat_4)
        feat_16 = self.feat_16(feat_8)
        feat_32 = self.feat_32(feat_16)
        feat_64 = self.se_64(feat_4, self.feat_64(feat_32))
        # Bug fix: removed a stray line-continuation '\' that dangled after
        # this statement (a latent syntax hazard on edit).
        feat_128 = self.se_128(feat_8, self.feat_128(feat_64))

        # Select the deepest feature map the target resolution requires.
        if self.img_resolution >= 64:
            feat_last = feat_64

        if self.img_resolution >= 128:
            feat_last = feat_128

        if self.img_resolution >= 224:
            feat_last = self.se_256(feat_16, self.feat_256(feat_last))

        if self.img_resolution >= 512:
            feat_last = self.se_512(feat_32, self.feat_512(feat_last))

        if self.img_resolution >= 1024:
            feat_last = self.feat_1024(feat_last)

        return torch.tanh(self.to_big(feat_last))
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
class FastganSynthesisCond(nn.Module):
    """Class-conditional FastGAN synthesis network.

    Like FastganSynthesis, but every up-block is conditioned on a class
    embedding derived from the one-hot label `c`.
    """
    def __init__(self, ngf=64, z_dim=256, nc=3, img_resolution=256, num_classes=1000, lite=False):
        super().__init__()

        self.z_dim = z_dim
        # channel multiplier per resolution
        nfc_multi = {2: 16, 4:16, 8:8, 16:4, 32:2, 64:2, 128:1, 256:0.5,
                     512:0.25, 1024:0.125, 2048:0.125}
        nfc = {}
        for k, v in nfc_multi.items():
            nfc[k] = int(v*ngf)

        self.img_resolution = img_resolution

        self.init = InitLayer(z_dim, channel=nfc[2], sz=4)

        UpBlock = UpBlockSmallCond if lite else UpBlockBigCond

        self.feat_8 = UpBlock(nfc[4], nfc[8], z_dim)
        self.feat_16 = UpBlock(nfc[8], nfc[16], z_dim)
        self.feat_32 = UpBlock(nfc[16], nfc[32], z_dim)
        self.feat_64 = UpBlock(nfc[32], nfc[64], z_dim)
        self.feat_128 = UpBlock(nfc[64], nfc[128], z_dim)
        self.feat_256 = UpBlock(nfc[128], nfc[256], z_dim)

        self.se_64 = SEBlock(nfc[4], nfc[64])
        self.se_128 = SEBlock(nfc[8], nfc[128])
        self.se_256 = SEBlock(nfc[16], nfc[256])

        self.to_big = conv2d(nfc[img_resolution], nc, 3, 1, 1, bias=True)

        if img_resolution > 256:
            # Bug fix: pass z_dim like every other conditional UpBlock —
            # forward calls these blocks with the class embedding `c`.
            self.feat_512 = UpBlock(nfc[256], nfc[512], z_dim)
            self.se_512 = SEBlock(nfc[32], nfc[512])
        if img_resolution > 512:
            self.feat_1024 = UpBlock(nfc[512], nfc[1024], z_dim)

        self.embed = nn.Embedding(num_classes, z_dim)

    def forward(self, input, c, update_emas=False):

        # Turn the one-hot label into a dense class embedding.
        c = self.embed(c.argmax(1))

        # map noise to hypersphere as in "Progressive Growing of GANS"
        input = normalize_second_moment(input[:, 0])

        feat_4 = self.init(input)
        feat_8 = self.feat_8(feat_4, c)
        feat_16 = self.feat_16(feat_8, c)
        feat_32 = self.feat_32(feat_16, c)
        feat_64 = self.se_64(feat_4, self.feat_64(feat_32, c))
        feat_128 = self.se_128(feat_8, self.feat_128(feat_64, c))

        if self.img_resolution >= 128:
            feat_last = feat_128

        if self.img_resolution >= 256:
            feat_last = self.se_256(feat_16, self.feat_256(feat_last, c))

        if self.img_resolution >= 512:
            feat_last = self.se_512(feat_32, self.feat_512(feat_last, c))

        if self.img_resolution >= 1024:
            feat_last = self.feat_1024(feat_last, c)
        return self.to_big(feat_last)
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
class Generator(nn.Module):
    """StyleGAN-API-compatible wrapper around the FastGAN synthesis network.

    `mapping` is an identity shim (DummyMapping); `cond` selects the
    class-conditional synthesis variant. `w_dim`/`mapping_kwargs` are kept
    for API parity and are unused here.
    """
    def __init__(
        self,
        z_dim=256,
        c_dim=0,
        w_dim=0,
        img_resolution=256,
        img_channels=3,
        ngf=128,
        cond=0,
        mapping_kwargs={},
        synthesis_kwargs={}
    ):
        super().__init__()
        self.z_dim = z_dim
        self.c_dim = c_dim
        self.w_dim = w_dim
        self.img_resolution = img_resolution
        self.img_channels = img_channels

        # Mapping and Synthesis Networks
        self.mapping = DummyMapping()  # to fit the StyleGAN API
        Synthesis = FastganSynthesisCond if cond else FastganSynthesis
        self.synthesis = Synthesis(ngf=ngf, z_dim=z_dim, nc=img_channels, img_resolution=img_resolution, **synthesis_kwargs)

    def forward(self, z, c, **kwargs):
        w = self.mapping(z, c)
        img = self.synthesis(w, c)
        return img
|
pg_modules/networks_stylegan2.py
ADDED
|
@@ -0,0 +1,537 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
| 2 |
+
#
|
| 3 |
+
# NVIDIA CORPORATION and its licensors retain all intellectual property
|
| 4 |
+
# and proprietary rights in and to this software, related documentation
|
| 5 |
+
# and any modifications thereto. Any use, reproduction, disclosure or
|
| 6 |
+
# distribution of this software and related documentation without an express
|
| 7 |
+
# license agreement from NVIDIA CORPORATION is strictly prohibited.
|
| 8 |
+
#
|
| 9 |
+
# modified by Axel Sauer for "Projected GANs Converge Faster"
|
| 10 |
+
#
|
| 11 |
+
import numpy as np
|
| 12 |
+
import torch
|
| 13 |
+
from torch_utils import misc
|
| 14 |
+
from torch_utils import persistence
|
| 15 |
+
from torch_utils.ops import conv2d_resample
|
| 16 |
+
from torch_utils.ops import upfirdn2d
|
| 17 |
+
from torch_utils.ops import bias_act
|
| 18 |
+
from torch_utils.ops import fma
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
@misc.profiled_function
def normalize_2nd_moment(x, dim=1, eps=1e-8):
    """Pixel-norm: scale x so its mean squared value along `dim` is ~1."""
    inv_norm = (x.square().mean(dim=dim, keepdim=True) + eps).rsqrt()
    return x * inv_norm
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
@misc.profiled_function
def modulated_conv2d(
    x,                          # Input tensor of shape [batch_size, in_channels, in_height, in_width].
    weight,                     # Weight tensor of shape [out_channels, in_channels, kernel_height, kernel_width].
    styles,                     # Modulation coefficients of shape [batch_size, in_channels].
    noise           = None,     # Optional noise tensor to add to the output activations.
    up              = 1,        # Integer upsampling factor.
    down            = 1,        # Integer downsampling factor.
    padding         = 0,        # Padding with respect to the upsampled image.
    resample_filter = None,     # Low-pass filter to apply when resampling activations. Must be prepared beforehand by calling upfirdn2d.setup_filter().
    demodulate      = True,     # Apply weight demodulation?
    flip_weight     = True,     # False = convolution, True = correlation (matches torch.nn.functional.conv2d).
    fused_modconv   = True,     # Perform modulation, convolution, and demodulation as a single fused operation?
):
    """StyleGAN2 modulated convolution: per-sample style modulation of the
    conv weights, optional demodulation, and optional up/downsampling.

    Returns a tensor of shape [batch_size, out_channels, out_height, out_width].
    """
    batch_size = x.shape[0]
    out_channels, in_channels, kh, kw = weight.shape
    misc.assert_shape(weight, [out_channels, in_channels, kh, kw]) # [OIkk]
    misc.assert_shape(x, [batch_size, in_channels, None, None]) # [NIHW]
    misc.assert_shape(styles, [batch_size, in_channels]) # [NI]

    # Pre-normalize inputs to avoid FP16 overflow.
    if x.dtype == torch.float16 and demodulate:
        weight = weight * (1 / np.sqrt(in_channels * kh * kw) / weight.norm(float('inf'), dim=[1,2,3], keepdim=True)) # max_Ikk
        styles = styles / styles.norm(float('inf'), dim=1, keepdim=True) # max_I

    # Calculate per-sample weights and demodulation coefficients.
    w = None
    dcoefs = None
    if demodulate or fused_modconv:
        w = weight.unsqueeze(0) # [NOIkk]
        w = w * styles.reshape(batch_size, 1, -1, 1, 1) # [NOIkk]
    if demodulate:
        dcoefs = (w.square().sum(dim=[2,3,4]) + 1e-8).rsqrt() # [NO]
    if demodulate and fused_modconv:
        w = w * dcoefs.reshape(batch_size, -1, 1, 1, 1) # [NOIkk]

    # Execute by scaling the activations before and after the convolution.
    if not fused_modconv:
        x = x * styles.to(x.dtype).reshape(batch_size, -1, 1, 1)
        x = conv2d_resample.conv2d_resample(x=x, w=weight.to(x.dtype), f=resample_filter, up=up, down=down, padding=padding, flip_weight=flip_weight)
        if demodulate and noise is not None:
            # Fused multiply-add: demodulate and add noise in one op.
            x = fma.fma(x, dcoefs.to(x.dtype).reshape(batch_size, -1, 1, 1), noise.to(x.dtype))
        elif demodulate:
            x = x * dcoefs.to(x.dtype).reshape(batch_size, -1, 1, 1)
        elif noise is not None:
            x = x.add_(noise.to(x.dtype))
        return x

    # Execute as one fused op using grouped convolution.
    with misc.suppress_tracer_warnings(): # this value will be treated as a constant
        batch_size = int(batch_size)
    misc.assert_shape(x, [batch_size, in_channels, None, None])
    # Fold the batch into the channel dim and use groups=batch_size so each
    # sample is convolved with its own modulated weights.
    x = x.reshape(1, -1, *x.shape[2:])
    w = w.reshape(-1, in_channels, kh, kw)
    x = conv2d_resample.conv2d_resample(x=x, w=w.to(x.dtype), f=resample_filter, up=up, down=down, padding=padding, groups=batch_size, flip_weight=flip_weight)
    x = x.reshape(batch_size, -1, *x.shape[2:])
    if noise is not None:
        x = x.add_(noise)
    return x
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
@persistence.persistent_class
class FullyConnectedLayer(torch.nn.Module):
    """Fully connected layer with equalized learning rate (StyleGAN2).

    Weights are stored at unit variance and rescaled at runtime by
    lr_multiplier / sqrt(in_features); the bias is rescaled by lr_multiplier.
    """
    def __init__(self,
        in_features,                # Number of input features.
        out_features,               # Number of output features.
        bias            = True,     # Apply additive bias before the activation function?
        activation      = 'linear', # Activation function: 'relu', 'lrelu', etc.
        lr_multiplier   = 1,        # Learning rate multiplier.
        bias_init       = 0,        # Initial value for the additive bias.
    ):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.activation = activation
        self.weight = torch.nn.Parameter(torch.randn([out_features, in_features]) / lr_multiplier)
        self.bias = torch.nn.Parameter(torch.full([out_features], np.float32(bias_init))) if bias else None
        self.weight_gain = lr_multiplier / np.sqrt(in_features)
        self.bias_gain = lr_multiplier

    def forward(self, x):
        w = self.weight.to(x.dtype) * self.weight_gain
        b = self.bias
        if b is not None:
            b = b.to(x.dtype)
            if self.bias_gain != 1:
                b = b * self.bias_gain

        # Fast path: plain affine transform can use the fused addmm kernel.
        if self.activation == 'linear' and b is not None:
            return torch.addmm(b.unsqueeze(0), x, w.t())

        out = x.matmul(w.t())
        return bias_act.bias_act(out, b, act=self.activation)

    def extra_repr(self):
        return f'in_features={self.in_features:d}, out_features={self.out_features:d}, activation={self.activation:s}'
| 124 |
+
|
| 125 |
+
@persistence.persistent_class
class Conv2dLayer(torch.nn.Module):
    """2D convolution with optional up/downsampling, equalized learning rate,
    and a fused bias + activation (StyleGAN2)."""
    def __init__(self,
        in_channels,                    # Number of input channels.
        out_channels,                   # Number of output channels.
        kernel_size,                    # Width and height of the convolution kernel.
        bias            = True,         # Apply additive bias before the activation function?
        activation      = 'linear',     # Activation function: 'relu', 'lrelu', etc.
        up              = 1,            # Integer upsampling factor.
        down            = 1,            # Integer downsampling factor.
        resample_filter = [1,3,3,1],    # Low-pass filter to apply when resampling activations.
        conv_clamp      = None,         # Clamp the output to +-X, None = disable clamping.
        channels_last   = False,        # Expect the input to have memory_format=channels_last?
        trainable       = True,         # Update the weights of this layer during training?
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.activation = activation
        self.up = up
        self.down = down
        self.conv_clamp = conv_clamp
        self.register_buffer('resample_filter', upfirdn2d.setup_filter(resample_filter))
        self.padding = kernel_size // 2
        # Runtime weight gain implements equalized learning rate.
        self.weight_gain = 1 / np.sqrt(in_channels * (kernel_size ** 2))
        self.act_gain = bias_act.activation_funcs[activation].def_gain

        memory_format = torch.channels_last if channels_last else torch.contiguous_format
        weight = torch.randn([out_channels, in_channels, kernel_size, kernel_size]).to(memory_format=memory_format)
        bias = torch.zeros([out_channels]) if bias else None
        if trainable:
            # Learnable parameters.
            self.weight = torch.nn.Parameter(weight)
            self.bias = torch.nn.Parameter(bias) if bias is not None else None
        else:
            # Frozen: register as buffers so they move with the module but
            # receive no gradient updates.
            self.register_buffer('weight', weight)
            if bias is not None:
                self.register_buffer('bias', bias)
            else:
                self.bias = None

    def forward(self, x, gain=1):
        w = self.weight * self.weight_gain
        b = self.bias.to(x.dtype) if self.bias is not None else None
        flip_weight = (self.up == 1) # slightly faster
        x = conv2d_resample.conv2d_resample(x=x, w=w.to(x.dtype), f=self.resample_filter, up=self.up, down=self.down, padding=self.padding, flip_weight=flip_weight)

        act_gain = self.act_gain * gain
        act_clamp = self.conv_clamp * gain if self.conv_clamp is not None else None
        return bias_act.bias_act(x, b, act=self.activation, gain=act_gain, clamp=act_clamp)

    def extra_repr(self):
        return ' '.join([
            f'in_channels={self.in_channels:d}, out_channels={self.out_channels:d}, activation={self.activation:s},',
            f'up={self.up}, down={self.down}'])
| 181 |
+
|
| 182 |
+
@persistence.persistent_class
class MappingNetwork(torch.nn.Module):
    """StyleGAN2 mapping network: maps latent z and label c to intermediate
    latents w.

    Normalizes the inputs, optionally embeds the conditioning label, runs a
    stack of fully connected layers, tracks a moving average of w for
    truncation, and optionally broadcasts w to num_ws copies.
    """
    def __init__(self,
        z_dim,                      # Input latent (Z) dimensionality, 0 = no latent.
        c_dim,                      # Conditioning label (C) dimensionality, 0 = no label.
        w_dim,                      # Intermediate latent (W) dimensionality.
        num_ws,                     # Number of intermediate latents to output, None = do not broadcast.
        num_layers      = 8,        # Number of mapping layers.
        embed_features  = None,     # Label embedding dimensionality, None = same as w_dim.
        layer_features  = None,     # Number of intermediate features in the mapping layers, None = same as w_dim.
        activation      = 'lrelu',  # Activation function: 'relu', 'lrelu', etc.
        lr_multiplier   = 0.01,     # Learning rate multiplier for the mapping layers.
        w_avg_beta      = 0.998,    # Decay for tracking the moving average of W during training, None = do not track.
    ):
        super().__init__()
        self.z_dim = z_dim
        self.c_dim = c_dim
        self.w_dim = w_dim
        self.num_ws = num_ws
        self.num_layers = num_layers
        self.w_avg_beta = w_avg_beta

        if embed_features is None:
            embed_features = w_dim
        if c_dim == 0:
            embed_features = 0
        if layer_features is None:
            layer_features = w_dim
        features_list = [z_dim + embed_features] + [layer_features] * (num_layers - 1) + [w_dim]

        if c_dim > 0:
            self.embed = FullyConnectedLayer(c_dim, embed_features)
        for idx in range(num_layers):
            in_features = features_list[idx]
            out_features = features_list[idx + 1]
            layer = FullyConnectedLayer(in_features, out_features, activation=activation, lr_multiplier=lr_multiplier)
            setattr(self, f'fc{idx}', layer)

        if num_ws is not None and w_avg_beta is not None:
            self.register_buffer('w_avg', torch.zeros([w_dim]))

    def forward(self, z, c, truncation_psi=1, truncation_cutoff=None, update_emas=False):
        # Embed, normalize, and concat inputs.
        x = None
        with torch.autograd.profiler.record_function('input'):
            if self.z_dim > 0:
                misc.assert_shape(z, [None, self.z_dim])
                x = normalize_2nd_moment(z.to(torch.float32))
            if self.c_dim > 0:
                misc.assert_shape(c, [None, self.c_dim])
                y = normalize_2nd_moment(self.embed(c.to(torch.float32)))
                x = torch.cat([x, y], dim=1) if x is not None else y

        # Main layers.
        for idx in range(self.num_layers):
            layer = getattr(self, f'fc{idx}')
            x = layer(x)

        # Update moving average of W.
        if update_emas and self.w_avg_beta is not None:
            with torch.autograd.profiler.record_function('update_w_avg'):
                self.w_avg.copy_(x.detach().mean(dim=0).lerp(self.w_avg, self.w_avg_beta))

        # Broadcast.
        if self.num_ws is not None:
            with torch.autograd.profiler.record_function('broadcast'):
                x = x.unsqueeze(1).repeat([1, self.num_ws, 1])

        # Apply truncation: interpolate towards the tracked average of W.
        if truncation_psi != 1:
            with torch.autograd.profiler.record_function('truncate'):
                assert self.w_avg_beta is not None
                if self.num_ws is None or truncation_cutoff is None:
                    x = self.w_avg.lerp(x, truncation_psi)
                else:
                    x[:, :truncation_cutoff] = self.w_avg.lerp(x[:, :truncation_cutoff], truncation_psi)
        return x

    def extra_repr(self):
        # num_ws may legitimately be None (no broadcasting); formatting it
        # with ':d' would raise TypeError, so format it generically.
        return f'z_dim={self.z_dim:d}, c_dim={self.c_dim:d}, w_dim={self.w_dim:d}, num_ws={self.num_ws}'
+
|
| 264 |
+
@persistence.persistent_class
class SynthesisLayer(torch.nn.Module):
    """Single StyleGAN2 synthesis layer: style-modulated convolution with
    optional upsampling, per-pixel noise, and a fused bias + activation."""
    def __init__(self,
        in_channels,                    # Number of input channels.
        out_channels,                   # Number of output channels.
        w_dim,                          # Intermediate latent (W) dimensionality.
        resolution,                     # Resolution of this layer.
        kernel_size     = 3,            # Convolution kernel size.
        up              = 1,            # Integer upsampling factor.
        use_noise       = True,         # Enable noise input?
        activation      = 'lrelu',      # Activation function: 'relu', 'lrelu', etc.
        resample_filter = [1,3,3,1],    # Low-pass filter to apply when resampling activations.
        conv_clamp      = None,         # Clamp the output of convolution layers to +-X, None = disable clamping.
        channels_last   = False,        # Use channels_last format for the weights?
    ):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.w_dim = w_dim
        self.resolution = resolution
        self.up = up
        self.use_noise = use_noise
        self.activation = activation
        self.conv_clamp = conv_clamp
        self.register_buffer('resample_filter', upfirdn2d.setup_filter(resample_filter))
        self.padding = kernel_size // 2
        self.act_gain = bias_act.activation_funcs[activation].def_gain

        # Affine layer maps w to per-channel modulation styles (bias_init=1
        # so the initial modulation is identity).
        self.affine = FullyConnectedLayer(w_dim, in_channels, bias_init=1)
        memory_format = torch.channels_last if channels_last else torch.contiguous_format
        self.weight = torch.nn.Parameter(torch.randn([out_channels, in_channels, kernel_size, kernel_size]).to(memory_format=memory_format))
        if use_noise:
            self.register_buffer('noise_const', torch.randn([resolution, resolution]))
            self.noise_strength = torch.nn.Parameter(torch.zeros([]))
        self.bias = torch.nn.Parameter(torch.zeros([out_channels]))

    def forward(self, x, w, noise_mode='random', fused_modconv=True, gain=1):
        assert noise_mode in ['random', 'const', 'none']
        in_resolution = self.resolution // self.up
        misc.assert_shape(x, [None, self.in_channels, in_resolution, in_resolution])
        styles = self.affine(w)

        # Select the noise source for this call.
        noise = None
        if self.use_noise and noise_mode == 'random':
            noise = torch.randn([x.shape[0], 1, self.resolution, self.resolution], device=x.device) * self.noise_strength
        if self.use_noise and noise_mode == 'const':
            noise = self.noise_const * self.noise_strength

        flip_weight = (self.up == 1) # slightly faster
        x = modulated_conv2d(x=x, weight=self.weight, styles=styles, noise=noise, up=self.up,
            padding=self.padding, resample_filter=self.resample_filter, flip_weight=flip_weight, fused_modconv=fused_modconv)

        act_gain = self.act_gain * gain
        act_clamp = self.conv_clamp * gain if self.conv_clamp is not None else None
        return bias_act.bias_act(x, self.bias.to(x.dtype), act=self.activation, gain=act_gain, clamp=act_clamp)

    def extra_repr(self):
        return ' '.join([
            f'in_channels={self.in_channels:d}, out_channels={self.out_channels:d}, w_dim={self.w_dim:d},',
            f'resolution={self.resolution:d}, up={self.up}, activation={self.activation:s}'])
| 326 |
+
|
| 327 |
+
@persistence.persistent_class
class ToRGBLayer(torch.nn.Module):
    """Projects feature maps to output color channels using a style-modulated
    1x1 convolution without demodulation (StyleGAN2 'tRGB')."""
    def __init__(self, in_channels, out_channels, w_dim, kernel_size=1, conv_clamp=None, channels_last=False):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.w_dim = w_dim
        self.conv_clamp = conv_clamp
        self.affine = FullyConnectedLayer(w_dim, in_channels, bias_init=1)
        memory_format = torch.channels_last if channels_last else torch.contiguous_format
        self.weight = torch.nn.Parameter(torch.randn([out_channels, in_channels, kernel_size, kernel_size]).to(memory_format=memory_format))
        self.bias = torch.nn.Parameter(torch.zeros([out_channels]))
        # Equalized learning rate gain, folded into the styles at runtime.
        self.weight_gain = 1 / np.sqrt(in_channels * (kernel_size ** 2))

    def forward(self, x, w, fused_modconv=True):
        styles = self.affine(w) * self.weight_gain
        x = modulated_conv2d(x=x, weight=self.weight, styles=styles, demodulate=False, fused_modconv=fused_modconv)
        return bias_act.bias_act(x, self.bias.to(x.dtype), clamp=self.conv_clamp)

    def extra_repr(self):
        return f'in_channels={self.in_channels:d}, out_channels={self.out_channels:d}, w_dim={self.w_dim:d}'
| 350 |
+
|
| 351 |
+
@persistence.persistent_class
class SynthesisBlock(torch.nn.Module):
    """One resolution level of the StyleGAN2 synthesis network.

    The first block (in_channels == 0) starts from a learned constant; later
    blocks upsample the incoming features. Each block applies one or two
    SynthesisLayers, optionally a ToRGB layer (skip/last), and optionally a
    residual skip connection (resnet architecture).
    """
    def __init__(self,
        in_channels,                            # Number of input channels, 0 = first block.
        out_channels,                           # Number of output channels.
        w_dim,                                  # Intermediate latent (W) dimensionality.
        resolution,                             # Resolution of this block.
        img_channels,                           # Number of output color channels.
        is_last,                                # Is this the last block?
        architecture            = 'skip',       # Architecture: 'orig', 'skip', 'resnet'.
        resample_filter         = [1,3,3,1],    # Low-pass filter to apply when resampling activations.
        conv_clamp              = 256,          # Clamp the output of convolution layers to +-X, None = disable clamping.
        use_fp16                = False,        # Use FP16 for this block?
        fp16_channels_last      = False,        # Use channels-last memory format with FP16?
        fused_modconv_default   = True,         # Default value of fused_modconv. 'inference_only' = True for inference, False for training.
        **layer_kwargs,                         # Arguments for SynthesisLayer.
    ):
        assert architecture in ['orig', 'skip', 'resnet']
        super().__init__()
        self.in_channels = in_channels
        self.w_dim = w_dim
        self.resolution = resolution
        self.img_channels = img_channels
        self.is_last = is_last
        self.architecture = architecture
        self.use_fp16 = use_fp16
        self.channels_last = (use_fp16 and fp16_channels_last)
        self.fused_modconv_default = fused_modconv_default
        self.register_buffer('resample_filter', upfirdn2d.setup_filter(resample_filter))
        self.num_conv = 0
        self.num_torgb = 0

        if in_channels == 0:
            # First block: learned constant input.
            self.const = torch.nn.Parameter(torch.randn([out_channels, resolution, resolution]))

        if in_channels != 0:
            self.conv0 = SynthesisLayer(in_channels, out_channels, w_dim=w_dim, resolution=resolution, up=2,
                resample_filter=resample_filter, conv_clamp=conv_clamp, channels_last=self.channels_last, **layer_kwargs)
            self.num_conv += 1

        self.conv1 = SynthesisLayer(out_channels, out_channels, w_dim=w_dim, resolution=resolution,
            conv_clamp=conv_clamp, channels_last=self.channels_last, **layer_kwargs)
        self.num_conv += 1

        if is_last or architecture == 'skip':
            self.torgb = ToRGBLayer(out_channels, img_channels, w_dim=w_dim,
                conv_clamp=conv_clamp, channels_last=self.channels_last)
            self.num_torgb += 1

        if in_channels != 0 and architecture == 'resnet':
            self.skip = Conv2dLayer(in_channels, out_channels, kernel_size=1, bias=False, up=2,
                resample_filter=resample_filter, channels_last=self.channels_last)

    def forward(self, x, img, ws, force_fp32=False, fused_modconv=None, update_emas=False, **layer_kwargs):
        _ = update_emas # unused
        misc.assert_shape(ws, [None, self.num_conv + self.num_torgb, self.w_dim])
        ws_iter = iter(ws.unbind(dim=1))
        if ws.device.type != 'cuda':
            force_fp32 = True
        dtype = torch.float16 if self.use_fp16 and not force_fp32 else torch.float32
        memory_format = torch.channels_last if self.channels_last and not force_fp32 else torch.contiguous_format
        if fused_modconv is None:
            fused_modconv = self.fused_modconv_default
        if fused_modconv == 'inference_only':
            fused_modconv = (not self.training)

        # Input.
        if self.in_channels == 0:
            x = self.const.to(dtype=dtype, memory_format=memory_format)
            x = x.unsqueeze(0).repeat([ws.shape[0], 1, 1, 1])
        else:
            misc.assert_shape(x, [None, self.in_channels, self.resolution // 2, self.resolution // 2])
            x = x.to(dtype=dtype, memory_format=memory_format)

        # Main layers.
        if self.in_channels == 0:
            x = self.conv1(x, next(ws_iter), fused_modconv=fused_modconv, **layer_kwargs)
        elif self.architecture == 'resnet':
            y = self.skip(x, gain=np.sqrt(0.5))
            x = self.conv0(x, next(ws_iter), fused_modconv=fused_modconv, **layer_kwargs)
            x = self.conv1(x, next(ws_iter), fused_modconv=fused_modconv, gain=np.sqrt(0.5), **layer_kwargs)
            x = y.add_(x)
        else:
            x = self.conv0(x, next(ws_iter), fused_modconv=fused_modconv, **layer_kwargs)
            x = self.conv1(x, next(ws_iter), fused_modconv=fused_modconv, **layer_kwargs)

        # ToRGB: upsample the running image and add this block's contribution.
        if img is not None:
            misc.assert_shape(img, [None, self.img_channels, self.resolution // 2, self.resolution // 2])
            img = upfirdn2d.upsample2d(img, self.resample_filter)
        if self.is_last or self.architecture == 'skip':
            y = self.torgb(x, next(ws_iter), fused_modconv=fused_modconv)
            y = y.to(dtype=torch.float32, memory_format=torch.contiguous_format)
            img = img.add_(y) if img is not None else y

        assert x.dtype == dtype
        assert img is None or img.dtype == torch.float32
        return x, img

    def extra_repr(self):
        return f'resolution={self.resolution:d}, architecture={self.architecture:s}'
+
|
| 454 |
+
@persistence.persistent_class
class SynthesisNetwork(torch.nn.Module):
    """Full StyleGAN2 synthesis network: a stack of SynthesisBlocks from 4x4
    up to img_resolution, consuming a broadcast latent tensor ws."""
    def __init__(self,
        w_dim,                      # Intermediate latent (W) dimensionality.
        img_resolution,             # Output image resolution.
        img_channels,               # Number of color channels.
        channel_base    = 32768,    # Overall multiplier for the number of channels.
        channel_max     = 512,      # Maximum number of channels in any layer.
        num_fp16_res    = 4,        # Use FP16 for the N highest resolutions.
        **block_kwargs,             # Arguments for SynthesisBlock.
    ):
        # Resolution must be a power of two >= 4.
        assert img_resolution >= 4 and img_resolution & (img_resolution - 1) == 0
        super().__init__()
        self.w_dim = w_dim
        self.img_resolution = img_resolution
        self.img_resolution_log2 = int(np.log2(img_resolution))
        self.img_channels = img_channels
        self.num_fp16_res = num_fp16_res
        self.block_resolutions = [2 ** i for i in range(2, self.img_resolution_log2 + 1)]
        channels_dict = {res: min(channel_base // res, channel_max) for res in self.block_resolutions}
        fp16_resolution = max(2 ** (self.img_resolution_log2 + 1 - num_fp16_res), 8)

        self.num_ws = 0
        for res in self.block_resolutions:
            in_channels = channels_dict[res // 2] if res > 4 else 0
            out_channels = channels_dict[res]
            use_fp16 = (res >= fp16_resolution)
            is_last = (res == self.img_resolution)
            block = SynthesisBlock(in_channels, out_channels, w_dim=w_dim, resolution=res,
                img_channels=img_channels, is_last=is_last, use_fp16=use_fp16, **block_kwargs)
            self.num_ws += block.num_conv
            if is_last:
                self.num_ws += block.num_torgb
            setattr(self, f'b{res}', block)

    def forward(self, ws, c=None, **block_kwargs):
        # Slice ws into per-block windows. Consecutive blocks overlap by
        # num_torgb latents: each block's ToRGB reuses the next block's
        # first latent.
        ws_per_block = []
        with torch.autograd.profiler.record_function('split_ws'):
            misc.assert_shape(ws, [None, self.num_ws, self.w_dim])
            ws = ws.to(torch.float32)
            w_idx = 0
            for res in self.block_resolutions:
                block = getattr(self, f'b{res}')
                ws_per_block.append(ws.narrow(1, w_idx, block.num_conv + block.num_torgb))
                w_idx += block.num_conv

        x = img = None
        for res, cur_ws in zip(self.block_resolutions, ws_per_block):
            block = getattr(self, f'b{res}')
            x, img = block(x, img, cur_ws, **block_kwargs)
        return img

    def extra_repr(self):
        return ' '.join([
            f'w_dim={self.w_dim:d}, num_ws={self.num_ws:d},',
            f'img_resolution={self.img_resolution:d}, img_channels={self.img_channels:d},',
            f'num_fp16_res={self.num_fp16_res:d}'])
|
| 512 |
+
|
| 513 |
+
@persistence.persistent_class
class Generator(torch.nn.Module):
    """Top-level StyleGAN2 generator: MappingNetwork followed by
    SynthesisNetwork."""
    def __init__(self,
        z_dim,                      # Input latent (Z) dimensionality.
        c_dim,                      # Conditioning label (C) dimensionality.
        w_dim,                      # Intermediate latent (W) dimensionality.
        img_resolution,             # Output resolution.
        img_channels,               # Number of output color channels.
        mapping_kwargs      = {},   # Arguments for MappingNetwork.
        **synthesis_kwargs,         # Arguments for SynthesisNetwork.
    ):
        super().__init__()
        self.z_dim = z_dim
        self.c_dim = c_dim
        self.w_dim = w_dim
        self.img_resolution = img_resolution
        self.img_channels = img_channels
        # Build the synthesis network first: its num_ws determines how many
        # latents the mapping network must broadcast.
        self.synthesis = SynthesisNetwork(w_dim=w_dim, img_resolution=img_resolution, img_channels=img_channels, **synthesis_kwargs)
        self.num_ws = self.synthesis.num_ws
        self.mapping = MappingNetwork(z_dim=z_dim, c_dim=c_dim, w_dim=w_dim, num_ws=self.num_ws, **mapping_kwargs)

    def forward(self, z, c, truncation_psi=1, truncation_cutoff=None, update_emas=False, **synthesis_kwargs):
        ws = self.mapping(z, c, truncation_psi=truncation_psi, truncation_cutoff=truncation_cutoff, update_emas=update_emas)
        return self.synthesis(ws, update_emas=update_emas, **synthesis_kwargs)
|
pg_modules/projector.py
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
|
| 4 |
+
from feature_networks.vit import forward_vit
|
| 5 |
+
from feature_networks.pretrained_builder import _make_pretrained
|
| 6 |
+
from feature_networks.constants import NORMALIZED_INCEPTION, NORMALIZED_IMAGENET, NORMALIZED_CLIP, VITS
|
| 7 |
+
from pg_modules.blocks import FeatureFusionBlock
|
| 8 |
+
|
| 9 |
+
def get_backbone_normstats(backbone):
    """Return the input normalization statistics for a backbone name.

    Args:
        backbone: Name of the pretrained feature network.

    Returns:
        dict with 'mean' and 'std' lists (per RGB channel).

    Raises:
        NotImplementedError: if the backbone is not in any known group.
    """
    if backbone in NORMALIZED_INCEPTION:
        return {
            'mean': [0.5, 0.5, 0.5],
            'std': [0.5, 0.5, 0.5],
        }

    elif backbone in NORMALIZED_IMAGENET:
        return {
            'mean': [0.485, 0.456, 0.406],
            'std': [0.229, 0.224, 0.225],
        }

    elif backbone in NORMALIZED_CLIP:
        return {
            'mean': [0.48145466, 0.4578275, 0.40821073],
            'std': [0.26862954, 0.26130258, 0.27577711],
        }

    else:
        # Name the offending backbone so the failure is actionable.
        raise NotImplementedError(f'No normalization stats known for backbone: {backbone!r}')
+
|
| 31 |
+
def _make_scratch_ccm(scratch, in_channels, cout, expand=False):
|
| 32 |
+
# shapes
|
| 33 |
+
out_channels = [cout, cout*2, cout*4, cout*8] if expand else [cout]*4
|
| 34 |
+
|
| 35 |
+
scratch.layer0_ccm = nn.Conv2d(in_channels[0], out_channels[0], kernel_size=1, stride=1, padding=0, bias=True)
|
| 36 |
+
scratch.layer1_ccm = nn.Conv2d(in_channels[1], out_channels[1], kernel_size=1, stride=1, padding=0, bias=True)
|
| 37 |
+
scratch.layer2_ccm = nn.Conv2d(in_channels[2], out_channels[2], kernel_size=1, stride=1, padding=0, bias=True)
|
| 38 |
+
scratch.layer3_ccm = nn.Conv2d(in_channels[3], out_channels[3], kernel_size=1, stride=1, padding=0, bias=True)
|
| 39 |
+
|
| 40 |
+
scratch.CHANNELS = out_channels
|
| 41 |
+
|
| 42 |
+
return scratch
|
| 43 |
+
|
| 44 |
+
def _make_scratch_csm(scratch, in_channels, cout, expand):
    """Attach cross-scale-mixing (CSM) FeatureFusionBlocks to `scratch`.

    Blocks are built from the deepest level (layer3, lowest=True) up to
    layer0. Sets scratch.CHANNELS to the fused output widths and returns
    scratch.
    """
    scratch.layer3_csm = FeatureFusionBlock(in_channels[3], nn.ReLU(False), expand=expand, lowest=True)
    scratch.layer2_csm = FeatureFusionBlock(in_channels[2], nn.ReLU(False), expand=expand)
    scratch.layer1_csm = FeatureFusionBlock(in_channels[1], nn.ReLU(False), expand=expand)
    scratch.layer0_csm = FeatureFusionBlock(in_channels[0], nn.ReLU(False))

    # last refinenet does not expand to save channels in higher dimensions
    scratch.CHANNELS = [cout, cout, cout*2, cout*4] if expand else [cout]*4
    return scratch
| 54 |
+
|
| 55 |
+
def _make_projector(im_res, backbone, cout, proj_type, expand=False):
    """Build the pretrained feature network plus the random projection head.

    proj_type: 0 = features only, 1 = + cross-channel mixing (CCM),
    2 = + cross-scale mixing (CSM). Returns (pretrained, scratch) where
    scratch is None for proj_type 0.
    """
    assert proj_type in [0, 1, 2], "Invalid projection type"

    # Pretrained feature network.
    pretrained = _make_pretrained(backbone)

    # Following Projected GAN: feature resolutions are fixed for a 256px
    # input. NOTE(review): the im_res argument is deliberately overridden
    # here, so callers' im_res has no effect — confirm this is intended.
    im_res = 256
    pretrained.RESOLUTIONS = [im_res//4, im_res//8, im_res//16, im_res//32]

    if proj_type == 0:
        return pretrained, None

    # Cross-channel mixing.
    scratch = _make_scratch_ccm(nn.Module(), in_channels=pretrained.CHANNELS, cout=cout, expand=expand)
    pretrained.CHANNELS = scratch.CHANNELS

    if proj_type == 1:
        return pretrained, scratch

    # Cross-scale mixing.
    scratch = _make_scratch_csm(scratch, in_channels=scratch.CHANNELS, cout=cout, expand=expand)

    # CSM upsamples x2 so the feature map resolution doubles
    pretrained.RESOLUTIONS = [res*2 for res in pretrained.RESOLUTIONS]
    pretrained.CHANNELS = scratch.CHANNELS

    return pretrained, scratch
| 83 |
+
|
| 84 |
+
class F_Identity(nn.Module):
    """Identity feature network: passes the input through unchanged."""
    def forward(self, x):
        return x
| 87 |
+
|
| 88 |
+
class F_RandomProj(nn.Module):
    """Pretrained feature extractor with a random projection head.

    Extracts 4 feature maps from a frozen pretrained backbone, then
    optionally mixes them across channels (CCM) and across scales (CSM)
    with randomly initialized layers, following Projected GAN.
    """
    def __init__(
        self,
        backbone="tf_efficientnet_lite3",
        im_res=256,
        cout=64,
        expand=True,
        proj_type=2,  # 0 = no projection, 1 = cross channel mixing, 2 = cross scale mixing
        **kwargs,
    ):
        super().__init__()
        self.proj_type = proj_type
        self.backbone = backbone
        self.cout = cout
        self.expand = expand
        self.normstats = get_backbone_normstats(backbone)

        # build pretrained feature network and random decoder (scratch)
        self.pretrained, self.scratch = _make_projector(im_res=im_res, backbone=self.backbone, cout=self.cout,
                                                        proj_type=self.proj_type, expand=self.expand)
        self.CHANNELS = self.pretrained.CHANNELS
        self.RESOLUTIONS = self.pretrained.RESOLUTIONS

    def forward(self, x):
        # Predict the raw backbone feature maps; ViT backbones need their
        # dedicated forward helper.
        if self.backbone in VITS:
            out0, out1, out2, out3 = forward_vit(self.pretrained, x)
        else:
            out0 = self.pretrained.layer0(x)
            out1 = self.pretrained.layer1(out0)
            out2 = self.pretrained.layer2(out1)
            out3 = self.pretrained.layer3(out2)

        # Keys enumerate from the shallowest layer (where the first
        # discriminator is attached).
        feats = {'0': out0, '1': out1, '2': out2, '3': out3}
        if self.proj_type == 0:
            return feats

        # Cross-channel mixing: one 1x1 conv per level.
        ccm0 = self.scratch.layer0_ccm(feats['0'])
        ccm1 = self.scratch.layer1_ccm(feats['1'])
        ccm2 = self.scratch.layer2_ccm(feats['2'])
        ccm3 = self.scratch.layer3_ccm(feats['3'])

        feats = {'0': ccm0, '1': ccm1, '2': ccm2, '3': ccm3}
        if self.proj_type == 1:
            return feats

        # Cross-scale mixing, fused from the deepest level upward.
        csm3 = self.scratch.layer3_csm(ccm3)
        csm2 = self.scratch.layer2_csm(csm3, ccm2)
        csm1 = self.scratch.layer1_csm(csm2, ccm1)
        csm0 = self.scratch.layer0_csm(csm1, ccm0)

        return {'0': csm0, '1': csm1, '2': csm2, '3': csm3}
|