| | import re
|
| | from abc import abstractmethod
|
| | from contextlib import contextmanager
|
| | from typing import Any, Dict, Tuple, Union
|
| |
|
| | import pytorch_lightning as pl
|
| | import torch
|
| | from omegaconf import ListConfig
|
| | from packaging import version
|
| | from safetensors.torch import load_file as load_safetensors
|
| |
|
| | from ..modules.diffusionmodules.model import Decoder, Encoder
|
| | from ..modules.distributions.distributions import DiagonalGaussianDistribution
|
| | from ..modules.ema import LitEma
|
| | from ..util import default, get_obj_from_str, instantiate_from_config
|
| |
|
| |
|
class AbstractAutoencoder(pl.LightningModule):
    """
    Base class for all autoencoders, including image autoencoders, image
    autoencoders with discriminators, unCLIP models, etc. It is deliberately
    general: specific features (e.g. discriminator training, encoding,
    decoding) must be implemented in subclasses.
    """

    def __init__(
        self,
        ema_decay: Union[None, float] = None,
        monitor: Union[None, str] = None,
        input_key: str = "jpg",
        ckpt_path: Union[None, str] = None,
        ignore_keys: Union[Tuple, list, ListConfig] = (),
    ):
        """
        Args:
            ema_decay: decay passed to ``LitEma``; ``None`` disables EMA tracking.
            monitor: if given, stored as ``self.monitor``; otherwise the
                attribute is not set at all.
            input_key: stored as ``self.input_key`` for use by ``get_input``
                implementations.
            ckpt_path: optional ``.ckpt`` / ``.safetensors`` file to restore from.
            ignore_keys: regex patterns; state-dict keys matching any of them
                (via ``re.match``) are dropped before loading.
        """
        super().__init__()
        self.input_key = input_key
        self.use_ema = ema_decay is not None
        if monitor is not None:
            self.monitor = monitor

        if self.use_ema:
            # NOTE(review): the EMA wrapper is built here, before any subclass
            # has registered its submodules — confirm LitEma picks up
            # parameters that are added afterwards.
            self.model_ema = LitEma(self, decay=ema_decay)
            print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.")

        if ckpt_path is not None:
            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)

        # Automatic optimization is switched off when running on torch >= 2.0.0.
        if version.parse(torch.__version__) >= version.parse("2.0.0"):
            self.automatic_optimization = False

    def init_from_ckpt(
        self, path: str, ignore_keys: Union[Tuple, list, ListConfig] = tuple()
    ) -> None:
        """
        Load weights from ``path`` into this module (non-strict).

        Args:
            path: file ending in ``ckpt`` (read with ``torch.load``, weights
                taken from its ``"state_dict"`` entry) or ``safetensors``.
            ignore_keys: regex patterns; any state-dict key for which
                ``re.match(pattern, key)`` succeeds is removed before loading.

        Raises:
            NotImplementedError: if ``path`` has neither supported suffix.
        """
        if path.endswith("ckpt"):
            # NOTE: torch.load unpickles the file; only load checkpoints from
            # trusted sources.
            sd = torch.load(path, map_location="cpu")["state_dict"]
        elif path.endswith("safetensors"):
            sd = load_safetensors(path)
        else:
            raise NotImplementedError(
                f"Unsupported checkpoint format (expected ckpt or safetensors): {path}"
            )

        # Iterate over a snapshot of the keys because entries are deleted from
        # `sd` while looping. Each key is tested against all patterns at once
        # so a key matched by several patterns is deleted exactly once
        # (deleting it once per matching pattern raised KeyError).
        for k in list(sd.keys()):
            if any(re.match(ik, k) for ik in ignore_keys):
                print("Deleting key {} from state_dict.".format(k))
                del sd[k]

        missing, unexpected = self.load_state_dict(sd, strict=False)
        print(
            f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys"
        )
        if len(missing) > 0:
            print(f"Missing Keys: {missing}")
        if len(unexpected) > 0:
            print(f"Unexpected Keys: {unexpected}")

    @abstractmethod
    def get_input(self, batch) -> Any:
        """Extract the model input from ``batch``; must be overridden."""
        raise NotImplementedError()

    def on_train_batch_end(self, *args, **kwargs):
        """After each training batch, call the EMA module on this model if EMA is enabled."""
        if self.use_ema:
            self.model_ema(self)

    @contextmanager
    def ema_scope(self, context=None):
        """
        Context manager that runs its body with EMA weights when EMA is enabled.

        On entry the current parameters are handed to ``model_ema.store`` and
        ``model_ema.copy_to(self)`` is called; on exit (including on error)
        ``model_ema.restore`` is called. Without EMA this is a no-op.

        Args:
            context: optional label; when given, switch/restore messages are
                printed with it as prefix.
        """
        if self.use_ema:
            self.model_ema.store(self.parameters())
            self.model_ema.copy_to(self)
            if context is not None:
                print(f"{context}: Switched to EMA weights")
        try:
            yield None
        finally:
            # Always put the training weights back, even if the body raised.
            if self.use_ema:
                self.model_ema.restore(self.parameters())
                if context is not None:
                    print(f"{context}: Restored training weights")

    @abstractmethod
    def encode(self, *args, **kwargs) -> torch.Tensor:
        """Encode an input; must be overridden."""
        raise NotImplementedError("encode()-method of abstract base class called")

    @abstractmethod
    def decode(self, *args, **kwargs) -> torch.Tensor:
        """Decode a latent; must be overridden."""
        raise NotImplementedError("decode()-method of abstract base class called")

    def instantiate_optimizer_from_config(self, params, lr, cfg):
        """
        Build an optimizer from a config mapping.

        Args:
            params: parameters handed to the optimizer constructor.
            lr: learning rate, passed as ``lr=``.
            cfg: mapping with a ``"target"`` import path and an optional
                ``"params"`` mapping of extra constructor keyword arguments.

        Returns:
            The object produced by calling the resolved target.
        """
        print(f"loading >>> {cfg['target']} <<< optimizer from config")
        return get_obj_from_str(cfg["target"])(
            params, lr=lr, **cfg.get("params", dict())
        )

    def configure_optimizers(self) -> Any:
        """Return the optimizers for training; must be overridden."""
        raise NotImplementedError()
|
| |
|
| |
|
class AutoencodingEngine(AbstractAutoencoder):
    """
    Base class for all image autoencoders that we train, like VQGAN or
    AutoencoderKL (those are also restored explicitly as special cases for
    legacy reasons). Regularizations such as KL or VQ live in the
    regularizer class.
    """

    def __init__(
        self,
        *args,
        encoder_config: Dict,
        decoder_config: Dict,
        loss_config: Dict,
        regularizer_config: Dict,
        optimizer_config: Union[Dict, None] = None,
        lr_g_factor: float = 1.0,
        **kwargs,
    ):
        """
        Instantiate encoder, decoder, loss and regularizer from their configs.

        Args:
            encoder_config / decoder_config / loss_config / regularizer_config:
                configs passed to ``instantiate_from_config``.
            optimizer_config: optimizer config; falls back to
                ``{"target": "torch.optim.Adam"}`` via ``default``.
            lr_g_factor: multiplier applied to the learning rate of the
                autoencoder optimizer.
            *args, **kwargs: forwarded to ``AbstractAutoencoder.__init__``.
        """
        super().__init__(*args, **kwargs)

        # Sub-modules are created in a fixed order: encoder, decoder, loss,
        # regularizer.
        self.encoder = instantiate_from_config(encoder_config)
        self.decoder = instantiate_from_config(decoder_config)
        self.loss = instantiate_from_config(loss_config)
        self.regularization = instantiate_from_config(regularizer_config)

        fallback_optimizer = {"target": "torch.optim.Adam"}
        self.optimizer_config = default(optimizer_config, fallback_optimizer)
        self.lr_g_factor = lr_g_factor

    def get_input(self, batch: Dict) -> torch.Tensor:
        """Return the entry of ``batch`` stored under ``self.input_key``."""
        return batch[self.input_key]

    def get_autoencoder_params(self) -> list:
        """Collect encoder, decoder, regularizer and autoencoder-side loss parameters."""
        collected = list(self.encoder.parameters())
        collected.extend(self.decoder.parameters())
        collected.extend(self.regularization.get_trainable_parameters())
        collected.extend(self.loss.get_trainable_autoencoder_parameters())
        return collected

    def get_discriminator_params(self) -> list:
        """Collect the parameters reported by ``self.loss.get_trainable_parameters()``."""
        return list(self.loss.get_trainable_parameters())

    def get_last_layer(self):
        """Return whatever the decoder reports as its last layer."""
        return self.decoder.get_last_layer()

    def encode(self, x: Any, return_reg_log: bool = False) -> Any:
        """
        Run the encoder followed by the regularizer.

        Returns:
            The regularized latent, or ``(latent, reg_log)`` when
            ``return_reg_log`` is true.
        """
        latent, reg_log = self.regularization(self.encoder(x))
        return (latent, reg_log) if return_reg_log else latent

    def decode(self, z: Any) -> torch.Tensor:
        """Run the decoder on ``z``."""
        return self.decoder(z)

    def forward(self, x: Any) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Encode then decode ``x``; returns ``(latent, reconstruction, reg_log)``."""
        latent, reg_log = self.encode(x, return_reg_log=True)
        reconstruction = self.decode(latent)
        return latent, reconstruction, reg_log

    def training_step(self, batch, batch_idx, optimizer_idx) -> Any:
        """
        Compute and log the training loss for the given optimizer.

        ``optimizer_idx`` 0 and 1 are forwarded to ``self.loss``; for any
        other value the forward pass still runs but nothing is logged and
        ``None`` is returned.
        """
        inputs = self.get_input(batch)
        _, reconstruction, reg_log = self(inputs)

        if not (optimizer_idx == 0 or optimizer_idx == 1):
            return None

        # Both optimizers use the same loss call; only optimizer_idx differs.
        step_loss, step_log = self.loss(
            reg_log,
            inputs,
            reconstruction,
            optimizer_idx,
            self.global_step,
            last_layer=self.get_last_layer(),
            split="train",
        )
        self.log_dict(
            step_log, prog_bar=False, logger=True, on_step=True, on_epoch=True
        )
        return step_loss

    def validation_step(self, batch, batch_idx) -> Dict:
        """Validate once with the current weights and once inside ``ema_scope``; merge both logs."""
        merged = self._validation_step(batch, batch_idx)
        with self.ema_scope():
            merged.update(self._validation_step(batch, batch_idx, postfix="_ema"))
        return merged

    def _validation_step(self, batch, batch_idx, postfix="") -> Dict:
        """
        Evaluate the loss for optimizer indices 0 and 1 under split
        ``"val" + postfix``, log the results and return the combined log dict.
        """
        inputs = self.get_input(batch)
        _, reconstruction, reg_log = self(inputs)
        split_name = "val" + postfix

        def _loss_log(optimizer_idx):
            # Only the log dict is used during validation; the loss value is dropped.
            _, log = self.loss(
                reg_log,
                inputs,
                reconstruction,
                optimizer_idx,
                self.global_step,
                last_layer=self.get_last_layer(),
                split=split_name,
            )
            return log

        ae_log = _loss_log(0)
        disc_log = _loss_log(1)

        rec_key = f"val{postfix}/rec_loss"
        self.log(rec_key, ae_log[rec_key])
        ae_log.update(disc_log)
        self.log_dict(ae_log)
        return ae_log

    def configure_optimizers(self) -> Any:
        """
        Create one optimizer for the autoencoder parameters and one for the
        discriminator parameters, both from ``self.optimizer_config``.

        Reads ``self.learning_rate``, which is not assigned anywhere in this
        class and must be set before this is called.

        Returns:
            ``([opt_ae, opt_disc], [])``.
        """
        ae_params = self.get_autoencoder_params()
        disc_params = self.get_discriminator_params()

        ae_lr = default(self.lr_g_factor, 1.0) * self.learning_rate
        opt_ae = self.instantiate_optimizer_from_config(
            ae_params, ae_lr, self.optimizer_config
        )
        opt_disc = self.instantiate_optimizer_from_config(
            disc_params, self.learning_rate, self.optimizer_config
        )
        return [opt_ae, opt_disc], []

    @torch.no_grad()
    def log_images(self, batch: Dict, **kwargs) -> Dict:
        """Return inputs, reconstructions and reconstructions produced inside ``ema_scope``."""
        inputs = self.get_input(batch)
        _, reconstruction, _ = self(inputs)
        images = {"inputs": inputs, "reconstructions": reconstruction}
        with self.ema_scope():
            _, reconstruction_ema, _ = self(inputs)
            images["reconstructions_ema"] = reconstruction_ema
        return images
|
| |
|
| |
|
class AutoencoderKL(AutoencodingEngine):
    """
    Autoencoder whose ``encode`` returns a ``DiagonalGaussianDistribution``.

    The parent class is initialised with ``torch.nn.Identity`` placeholders;
    the real ``Encoder`` / ``Decoder`` are then built from ``ddconfig``.
    """

    def __init__(self, embed_dim: int, **kwargs):
        """
        Args:
            embed_dim: channel count of the latent after ``quant_conv``
                (which outputs ``2 * embed_dim`` channels).
            **kwargs: must contain ``ddconfig`` (with a truthy ``double_z``
                and a ``z_channels`` entry) and ``lossconfig``; may contain
                ``ckpt_path`` and ``ignore_keys``. Everything else is
                forwarded to ``AutoencodingEngine.__init__``.
        """
        ddconfig = kwargs.pop("ddconfig")
        # ckpt_path / ignore_keys are withheld from the parent constructor so
        # the checkpoint is loaded only after all layers below exist.
        ckpt_path = kwargs.pop("ckpt_path", None)
        ignore_keys = kwargs.pop("ignore_keys", ())
        loss_config = kwargs.pop("lossconfig")

        super().__init__(
            encoder_config={"target": "torch.nn.Identity"},
            decoder_config={"target": "torch.nn.Identity"},
            regularizer_config={"target": "torch.nn.Identity"},
            loss_config=loss_config,
            **kwargs,
        )
        assert ddconfig["double_z"]

        z_channels = ddconfig["z_channels"]
        self.encoder = Encoder(**ddconfig)
        self.decoder = Decoder(**ddconfig)
        # 1x1 convolutions mapping between encoder/decoder channels and the
        # embedding dimension.
        self.quant_conv = torch.nn.Conv2d(2 * z_channels, 2 * embed_dim, 1)
        self.post_quant_conv = torch.nn.Conv2d(embed_dim, z_channels, 1)
        self.embed_dim = embed_dim

        if ckpt_path is not None:
            self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys)

    def encode(self, x):
        """
        Encode ``x`` and wrap the result of ``quant_conv`` in a
        ``DiagonalGaussianDistribution``. Asserts the module is not in
        training mode.
        """
        assert (
            not self.training
        ), f"{self.__class__.__name__} only supports inference currently"
        moments = self.quant_conv(self.encoder(x))
        return DiagonalGaussianDistribution(moments)

    def decode(self, z, **decoder_kwargs):
        """Apply ``post_quant_conv`` to ``z`` and decode; extra kwargs go to the decoder."""
        return self.decoder(self.post_quant_conv(z), **decoder_kwargs)
|
| |
|
| |
|
class AutoencoderKLInferenceWrapper(AutoencoderKL):
    """``AutoencoderKL`` variant whose ``encode`` returns a sample instead of the distribution."""

    def encode(self, x):
        """Encode ``x`` with the parent class and return ``.sample()`` of the result."""
        posterior = super().encode(x)
        return posterior.sample()
|
| |
|
| |
|
class IdentityFirstStage(AbstractAutoencoder):
    """Pass-through autoencoder: input extraction, encoding and decoding all return their argument unchanged."""

    def __init__(self, *args, **kwargs):
        """Forward all arguments to ``AbstractAutoencoder.__init__``."""
        super().__init__(*args, **kwargs)

    def get_input(self, x: Any) -> Any:
        """Return ``x`` as-is."""
        return x

    def encode(self, x: Any, *args, **kwargs) -> Any:
        """Return ``x`` as-is; extra arguments are ignored."""
        return x

    def decode(self, x: Any, *args, **kwargs) -> Any:
        """Return ``x`` as-is; extra arguments are ignored."""
        return x
|
| |
|