nateraw
/

stable-diffusion-walk

Model card Files Files and versions

xet

Community

nateraw committed on Oct 4, 2022

Commit

006a2b7

1 Parent(s): ff645b2

Create new file

Browse files

Files changed (1) hide show

pipeline.py +674 -0

pipeline.py ADDED Viewed

	@@ -0,0 +1,674 @@

+import inspect
+import json
+import subprocess
+from pathlib import Path
+from typing import Callable, List, Optional, Union
+import numpy as np
+import torch
+from PIL import Image
+import cv2
+from diffusers.configuration_utils import FrozenDict
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.schedulers import DDIMScheduler, LMSDiscreteScheduler, PNDMScheduler
+from diffusers.utils import deprecate, logging
+from huggingface_hub import hf_hub_download
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

# Ready-made scheduler instances, selectable by name through SCHEDULERS.
# All three are built with the same scaled-linear beta schedule endpoints.
default_scheduler = PNDMScheduler(
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule="scaled_linear",
)
ddim_scheduler = DDIMScheduler(
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule="scaled_linear",
    clip_sample=False,
    set_alpha_to_one=False,
)
klms_scheduler = LMSDiscreteScheduler(
    beta_start=0.00085,
    beta_end=0.012,
    beta_schedule="scaled_linear",
)

# Name -> scheduler instance lookup table.
SCHEDULERS = {
    "default": default_scheduler,
    "ddim": ddim_scheduler,
    "klms": klms_scheduler,
}
def slerp(t, v0, v1, DOT_THRESHOLD=0.9995):
    """Spherically interpolate between two arrays ``v0`` and ``v1``.

    Args:
        t: Interpolation fraction; ``0`` yields ``v0`` and ``1`` yields ``v1``.
        v0: Start point. Either a ``np.ndarray`` or an object exposing
            ``.device`` and ``.cpu().numpy()`` (e.g. a ``torch.Tensor``).
        v1: End point, of the same kind and shape as ``v0``.
        DOT_THRESHOLD: When the absolute cosine between the (flattened)
            inputs exceeds this value they are treated as colinear and plain
            linear interpolation is used instead.

    Returns:
        The interpolated array. NumPy inputs give a ``np.ndarray``; any other
        input is converted back with ``torch.from_numpy`` and moved to the
        device of ``v0``.
    """
    # Always define the flag: the original only set it on the tensor branch,
    # which made NumPy inputs fail with UnboundLocalError further down.
    inputs_are_torch = not isinstance(v0, np.ndarray)
    if inputs_are_torch:
        input_device = v0.device
        v0 = v0.cpu().numpy()
        v1 = v1.cpu().numpy()

    # Cosine of the angle between the two inputs, computed over all elements.
    dot = np.sum(v0 * v1 / (np.linalg.norm(v0) * np.linalg.norm(v1)))
    if np.abs(dot) > DOT_THRESHOLD:
        # Nearly parallel: sin(theta) is close to 0, so fall back to lerp.
        v2 = (1 - t) * v0 + t * v1
    else:
        theta_0 = np.arccos(dot)
        sin_theta_0 = np.sin(theta_0)
        theta_t = theta_0 * t
        sin_theta_t = np.sin(theta_t)
        s0 = np.sin(theta_0 - theta_t) / sin_theta_0
        s1 = sin_theta_t / sin_theta_0
        v2 = s0 * v0 + s1 * v1

    if inputs_are_torch:
        v2 = torch.from_numpy(v2).to(input_device)
    return v2
class RealESRGANModel(torch.nn.Module):
    """Wrapper module around a 4x Real-ESRGAN upsampler."""

    def __init__(self, model_path, tile=0, tile_pad=10, pre_pad=0, fp32=False):
        """Build the RRDBNet backbone and the ``RealESRGANer`` around it.

        Args:
            model_path: Path to the Real-ESRGAN weights file.
            tile: Forwarded to ``RealESRGANer`` as ``tile``.
            tile_pad: Forwarded to ``RealESRGANer`` as ``tile_pad``.
            pre_pad: Forwarded to ``RealESRGANer`` as ``pre_pad``.
            fp32: When False (the default) the upsampler is created with ``half=True``.

        Raises:
            ImportError: If ``basicsr`` / ``realesrgan`` cannot be imported.
        """
        super().__init__()
        try:
            from basicsr.archs.rrdbnet_arch import RRDBNet
            from realesrgan import RealESRGANer
        except ImportError as e:
            # Chain explicitly so the underlying import failure stays visible.
            raise ImportError(
                "You tried to import realesrgan without having it installed properly. To install Real-ESRGAN, run:\n\n"
                "pip install realesrgan"
            ) from e

        model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=4)
        self.upsampler = RealESRGANer(
            scale=4,
            model_path=model_path,
            model=model,
            tile=tile,
            tile_pad=tile_pad,
            pre_pad=pre_pad,
            half=not fp32,
        )

    def forward(self, image, outscale=4, convert_to_pil=True):
        """Upsample an image array or path.

        Args:
            image (Union[np.ndarray, str, Path]): Either a np array or an image path. A np array is assumed to be
                in RGB format; it is multiplied by 255 and cast to uint8 (so presumably holds floats in [0, 1])
                and then flipped to BGR.
            outscale (int, optional): Amount to upscale the image. Defaults to 4.
            convert_to_pil (bool, optional): If True, return PIL image. Otherwise, return numpy array (BGR). Defaults to True.

        Returns:
            Union[np.ndarray, PIL.Image.Image]: An upsampled version of the input image.

        Raises:
            ValueError: If ``image`` is a path that OpenCV could not read.
        """
        if isinstance(image, (str, Path)):
            # cv2.imread is given a plain string so Path objects are accepted too.
            img = cv2.imread(str(image), cv2.IMREAD_UNCHANGED)
            if img is None:
                # imread signals failure by returning None instead of raising.
                raise ValueError(f"Could not read an image from {image}")
        else:
            img = image
            img = (img * 255).round().astype("uint8")
            # RGB -> BGR
            img = img[:, :, ::-1]

        image, _ = self.upsampler.enhance(img, outscale=outscale)

        if convert_to_pil:
            # BGR -> RGB before handing the array to PIL
            image = Image.fromarray(image[:, :, ::-1])

        return image

    @classmethod
    def from_pretrained(cls, model_name_or_path='nateraw/real-esrgan'):
        """Initialize a pretrained Real-ESRGAN upsampler.

        Example:
            ```python
            >>> pipe = RealESRGANModel.from_pretrained('nateraw/real-esrgan')
            >>> im_out = pipe('input_img.jpg')
            ```

        Args:
            model_name_or_path (str, optional): The Hugging Face repo ID or path to local model. Defaults to 'nateraw/real-esrgan'.

        Returns:
            RealESRGANModel: An instance of `RealESRGANModel` instantiated from pretrained model.
        """
        # reuploaded from official ones mentioned here:
        # https://github.com/xinntao/Real-ESRGAN
        if Path(model_name_or_path).exists():
            file = model_name_or_path
        else:
            file = hf_hub_download(model_name_or_path, 'RealESRGAN_x4plus.pth')
        return cls(file)

    def upsample_imagefolder(self, in_dir, out_dir, suffix='out', outfile_ext='.png'):
        """Upsample every .png/.jpg/.jpeg file found directly inside ``in_dir``.

        Each result is saved to ``out_dir`` as ``<stem><suffix><outfile_ext>``.

        Args:
            in_dir: Directory containing the input images.
            out_dir: Directory the results are written to; created if missing.
            suffix: Text appended to each input file stem.
            outfile_ext: Extension (with leading dot) of the written files.

        Raises:
            FileNotFoundError: If ``in_dir`` does not exist.
        """
        in_dir, out_dir = Path(in_dir), Path(out_dir)
        if not in_dir.exists():
            raise FileNotFoundError(f"Provided input directory {in_dir} does not exist")

        out_dir.mkdir(exist_ok=True, parents=True)

        image_paths = [x for x in in_dir.glob('*') if x.suffix.lower() in ['.png', '.jpg', '.jpeg']]
        for image in image_paths:
            im = self(str(image))
            out_filepath = out_dir / (image.stem + suffix + outfile_ext)
            im.save(out_filepath)
class NoUpsamplingModel(torch.nn.Module):
    """Pass-through module whose forward returns its input untouched."""

    def __init__(self):
        # No parameters or buffers of its own; only the base Module set-up.
        super().__init__()

    def forward(self, images):
        """Return ``images`` unchanged (the very same object)."""
        return images
def make_video_ffmpeg(frame_dir, output_file_name='output.mp4', frame_filename="frame%06d.png", fps=30):
    """Encode the numbered frames in ``frame_dir`` into an H.264 video with ffmpeg.

    Args:
        frame_dir: Directory (``str`` or ``Path``) holding the frames; the video is written there too.
        output_file_name: File name of the video created inside ``frame_dir``.
        frame_filename: printf-style frame name pattern passed to ffmpeg's ``-i``.
        fps: Frame rate passed to ffmpeg's ``-r``.

    Returns:
        str: Path of the output video. ffmpeg's exit status is not checked, so
        the path is returned even when encoding failed.
    """
    # Accept plain strings as well as Path objects.
    frame_dir = Path(frame_dir)
    frame_ref_path = str(frame_dir / frame_filename)
    video_path = str(frame_dir / output_file_name)

    # Pass an argument list rather than splitting a formatted string, so that
    # paths containing whitespace reach ffmpeg as single arguments.
    command = [
        "ffmpeg",
        "-r", str(fps),
        "-i", frame_ref_path,
        "-vcodec", "libx264",
        "-crf", "10",
        "-pix_fmt", "yuv420p",
        video_path,
    ]
    subprocess.call(command)
    return video_path
class StableDiffusionWalkPipeline(DiffusionPipeline):
    r"""
    Pipeline for generating videos by interpolating Stable Diffusion's latent space.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)

    Args:
        vae ([`AutoencoderKL`]):
            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
        text_encoder ([`CLIPTextModel`]):
            Frozen text-encoder. Stable Diffusion uses the text portion of
            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
        tokenizer (`CLIPTokenizer`):
            Tokenizer of class
            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
        scheduler ([`SchedulerMixin`]):
            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
        safety_checker ([`StableDiffusionSafetyChecker`]):
            Classification module that estimates whether generated images could be considered offensive or harmful.
            Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
        feature_extractor ([`CLIPFeatureExtractor`]):
            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
    """

    def __init__(
        self,
        vae: AutoencoderKL,
        text_encoder: CLIPTextModel,
        tokenizer: CLIPTokenizer,
        unet: UNet2DConditionModel,
        scheduler: Union[DDIMScheduler, PNDMScheduler, LMSDiscreteScheduler],
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPFeatureExtractor,
    ):
        super().__init__()

        # Schedulers whose config carries a `steps_offset` other than 1 get a
        # deprecation warning and have their config patched to 1 in place.
        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
            deprecation_message = (
                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
                " file"
            )
            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
            new_config = dict(scheduler.config)
            new_config["steps_offset"] = 1
            scheduler._internal_dict = FrozenDict(new_config)

        self.register_modules(
            vae=vae,
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            unet=unet,
            scheduler=scheduler,
            safety_checker=safety_checker,
            feature_extractor=feature_extractor,
        )

    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
        r"""
        Enable sliced attention computation.

        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
        in several steps. This is useful to save some memory in exchange for a small speed decrease.

        Args:
            slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
                a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
                `attention_head_dim` must be a multiple of `slice_size`.
        """
        if slice_size == "auto":
            # half the attention head size is usually a good trade-off between
            # speed and memory
            slice_size = self.unet.config.attention_head_dim // 2
        self.unet.set_attention_slice(slice_size)

    def disable_attention_slicing(self):
        r"""
        Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
        back to computing attention in one step.
        """
        # set slice_size = `None` to disable `attention slicing`
        self.enable_attention_slicing(None)

    @torch.no_grad()
    def step(
        self,
        prompt: Optional[Union[str, List[str]]] = None,
        height: int = 512,
        width: int = 512,
        num_inference_steps: int = 50,
        guidance_scale: float = 7.5,
        eta: float = 0.0,
        generator: Optional[torch.Generator] = None,
        latents: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
        callback_steps: Optional[int] = 1,
        text_embeddings: Optional[torch.FloatTensor] = None,
        **kwargs,
    ):
        r"""
        Run one full denoising pass and decode the result into images.

        Args:
            prompt (`str` or `List[str]`, *optional*):
                The prompt or prompts to guide the image generation. Only used (and then required) when
                `text_embeddings` is not given.
            height (`int`, *optional*, defaults to 512):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to 512):
                The width in pixels of the generated image.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
            guidance_scale (`float`, *optional*, defaults to 7.5):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            eta (`float`, *optional*, defaults to 0.0):
                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
                [`schedulers.DDIMScheduler`], will be ignored for others.
            generator (`torch.Generator`, *optional*):
                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
                deterministic.
            latents (`torch.FloatTensor`, *optional*):
                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
                tensor will be generated by sampling using the supplied random `generator`.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`. Any value other
                than `"pil"` leaves the images as a numpy array.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                plain tuple.
            callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. The function will be
                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function will be called. Must be a positive integer.
            text_embeddings (`torch.FloatTensor`, *optional*):
                Pre-generated text embeddings. When given, `prompt` is ignored and the batch size is taken from the
                first dimension of this tensor.

        Returns:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple`.
            When returning a tuple, the first element is a list with the generated images, and the second element is a
            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
            (nsfw) content, according to the `safety_checker`.

        Raises:
            ValueError: If `height`/`width` are not divisible by 8, `callback_steps` is not a positive integer,
                `prompt` has an unsupported type, or `latents` has an unexpected shape.
        """
        if height % 8 != 0 or width % 8 != 0:
            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

        if callback_steps is None or not isinstance(callback_steps, int) or callback_steps <= 0:
            raise ValueError(
                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
                f" {type(callback_steps)}."
            )

        if text_embeddings is None:
            if isinstance(prompt, str):
                batch_size = 1
            elif isinstance(prompt, list):
                batch_size = len(prompt)
            else:
                raise ValueError(
                    f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
                )

            # get prompt text embeddings
            text_inputs = self.tokenizer(
                prompt,
                padding="max_length",
                max_length=self.tokenizer.model_max_length,
                return_tensors="pt",
            )
            text_input_ids = text_inputs.input_ids

            if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
                removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
                logger.warning(
                    "The following part of your input was truncated because CLIP can only handle sequences up to"
                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
                )
                text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
            text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
        else:
            batch_size = text_embeddings.shape[0]

        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0
        # get unconditional embeddings for classifier free guidance
        if do_classifier_free_guidance:
            # HACK - Not setting text_input_ids here when walking, so hard coding to max length of tokenizer
            # TODO - Determine if this is OK to do
            # max_length = text_input_ids.shape[-1]
            max_length = self.tokenizer.model_max_length
            uncond_input = self.tokenizer(
                [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="pt"
            )
            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

        # get the initial random noise unless the user supplied it

        # Unlike in other pipelines, latents need to be generated in the target device
        # for 1-to-1 results reproducibility with the CompVis implementation.
        # However this currently doesn't work in `mps`.
        latents_device = "cpu" if self.device.type == "mps" else self.device
        latents_shape = (batch_size, self.unet.in_channels, height // 8, width // 8)
        if latents is None:
            latents = torch.randn(
                latents_shape,
                generator=generator,
                device=latents_device,
                dtype=text_embeddings.dtype,
            )
        else:
            if latents.shape != latents_shape:
                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
            latents = latents.to(latents_device)

        # set timesteps
        self.scheduler.set_timesteps(num_inference_steps)

        # Some schedulers like PNDM have timesteps as arrays
        # It's more optimized to move all timesteps to correct device beforehand
        if torch.is_tensor(self.scheduler.timesteps):
            timesteps_tensor = self.scheduler.timesteps.to(self.device)
        else:
            timesteps_tensor = torch.tensor(self.scheduler.timesteps.copy(), device=self.device)

        # if we use LMSDiscreteScheduler, let's make sure latents are multiplied by sigmas
        if isinstance(self.scheduler, LMSDiscreteScheduler):
            latents = latents * self.scheduler.sigmas[0]

        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
        # and should be between [0, 1]
        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
        extra_step_kwargs = {}
        if accepts_eta:
            extra_step_kwargs["eta"] = eta

        for i, t in enumerate(self.progress_bar(timesteps_tensor)):
            # expand the latents if we are doing classifier free guidance
            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
            if isinstance(self.scheduler, LMSDiscreteScheduler):
                sigma = self.scheduler.sigmas[i]
                # the model input needs to be scaled to match the continuous ODE formulation in K-LMS
                latent_model_input = latent_model_input / ((sigma**2 + 1) ** 0.5)

            # predict the noise residual
            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample

            # perform guidance
            if do_classifier_free_guidance:
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

            # compute the previous noisy sample x_t -> x_t-1
            # (the LMS scheduler is stepped with the loop index, the others with the timestep)
            if isinstance(self.scheduler, LMSDiscreteScheduler):
                latents = self.scheduler.step(noise_pred, i, latents, **extra_step_kwargs).prev_sample
            else:
                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

            # call the callback, if provided
            if callback is not None and i % callback_steps == 0:
                callback(i, t, latents)

        # undo the latent scaling before decoding with the VAE
        latents = 1 / 0.18215 * latents
        image = self.vae.decode(latents).sample

        # map from [-1, 1] to [0, 1] and move channels last for numpy / PIL
        image = (image / 2 + 0.5).clamp(0, 1)
        image = image.cpu().permute(0, 2, 3, 1).numpy()

        safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(self.device)
        image, has_nsfw_concept = self.safety_checker(
            images=image, clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype)
        )

        if output_type == "pil":
            image = self.numpy_to_pil(image)

        if not return_dict:
            return (image, has_nsfw_concept)

        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)

    def __call__(
        self,
        prompts: List[str] = ["blueberry spaghetti", "strawberry spaghetti"],
        seeds: List[int] = [42, 123],
        num_interpolation_steps: Union[int, List[int]] = 5,
        output_dir: str = "dreams",
        name: str = "berry_good_spaghetti",
        height: int = 512,
        width: int = 512,
        guidance_scale: float = 7.5,
        eta: float = 0.0,
        num_inference_steps: int = 50,
        do_loop: bool = False,
        make_video: bool = False,
        use_lerp_for_text: bool = True,
        scheduler: str = "klms",  # choices: default, ddim, klms
        disable_tqdm: bool = False,
        upsample: bool = False,
        fps: int = 30,
        resume: bool = False,
        batch_size: int = 1,
        frame_filename_ext: str = '.png',
    ):
        """Walk between consecutive prompt/seed pairs and write one image per interpolation step.

        Frames are saved as ``<output_dir>/<name>/frame%06d<frame_filename_ext>``. Unless resuming, the
        run settings are also written to ``prompt_config.json`` in that folder.

        Args:
            prompts: Prompts to walk through, in order.
            seeds: One seed per prompt, used to draw that prompt's initial latent noise.
            num_interpolation_steps: Number of frames for each consecutive pair of prompts. A single int
                is used for every pair; a list needs ``len(prompts) - 1`` entries.
            output_dir: Parent directory of the run folder.
            name: Name of the run folder (and of the video file, if one is made).
            height: Image height in pixels, forwarded to `step`.
            width: Image width in pixels, forwarded to `step`.
            guidance_scale: Forwarded to `step`.
            eta: Forwarded to `step`.
            num_inference_steps: Forwarded to `step`.
            do_loop: If True, append a final segment that walks back to the first prompt/seed.
            make_video: If True, encode the frames with ffmpeg once all frames are written.
            use_lerp_for_text: Interpolate text embeddings with ``torch.lerp`` when True, with `slerp` otherwise.
                Latents are always interpolated with `slerp`.
            scheduler: Key into the module-level ``SCHEDULERS`` table.
            disable_tqdm: Disable the per-step progress bar.
            upsample: If True, run each generated image through a Real-ESRGAN upsampler before saving.
            fps: Frame rate of the video.
            resume: If True, load the settings from the existing ``prompt_config.json`` and skip frames
                that already exist on disk.
            batch_size: Number of frames generated per call to `step`.
            frame_filename_ext: Extension (with leading dot) of the saved frames.

        Returns:
            The video path returned by ``make_video_ffmpeg`` when ``make_video`` is True, otherwise ``None``.

        Raises:
            FileNotFoundError: If ``resume`` is True but no prompt config file exists.
            ValueError: If the lengths of ``prompts``, ``seeds`` and ``num_interpolation_steps`` do not line up.
        """
        if upsample:
            # Lazily create the upsampler the first time it is needed.
            if getattr(self, 'upsampler', None) is None:
                self.upsampler = RealESRGANModel.from_pretrained('nateraw/real-esrgan')
            self.upsampler.to(self.device)

        output_path = Path(output_dir) / name
        output_path.mkdir(exist_ok=True, parents=True)
        prompt_config_path = output_path / 'prompt_config.json'

        if not resume:
            # Write prompt info to file in output dir so we can keep track of what we did
            prompt_config_path.write_text(
                json.dumps(
                    dict(
                        prompts=prompts,
                        seeds=seeds,
                        num_interpolation_steps=num_interpolation_steps,
                        name=name,
                        guidance_scale=guidance_scale,
                        eta=eta,
                        num_inference_steps=num_inference_steps,
                        do_loop=do_loop,
                        make_video=make_video,
                        use_lerp_for_text=use_lerp_for_text,
                        scheduler=scheduler,
                        upsample=upsample,
                        fps=fps,
                        height=height,
                        width=width,
                    ),
                    indent=2,
                    sort_keys=False,
                )
            )
        else:
            # When resuming, we load all available info from existing prompt config, using kwargs passed in where necessary
            if not prompt_config_path.exists():
                raise FileNotFoundError(f"You specified resume=True, but no prompt config file was found at {prompt_config_path}")

            # read_text opens and closes the file, unlike a bare open() passed to json.load
            data = json.loads(prompt_config_path.read_text())
            prompts = data['prompts']
            seeds = data['seeds']
            # NOTE - num_steps was renamed to num_interpolation_steps. Including it here for backwards compatibility.
            num_interpolation_steps = data.get('num_interpolation_steps') or data.get('num_steps')
            height = data['height'] if 'height' in data else height
            width = data['width'] if 'width' in data else width
            guidance_scale = data['guidance_scale']
            eta = data['eta']
            num_inference_steps = data['num_inference_steps']
            do_loop = data['do_loop']
            make_video = data['make_video']
            use_lerp_for_text = data['use_lerp_for_text']
            scheduler = data['scheduler']
            upsample = data['upsample'] if 'upsample' in data else upsample
            fps = data['fps'] if 'fps' in data else fps

            # Frame files are named "frame" + zero-padded index, so stem[5:] is the index.
            existing_frames = sorted(output_path.glob(f"frame*{frame_filename_ext}"))
            if existing_frames:
                resume_step = int(existing_frames[-1].stem[5:])
                print(f"\nResuming {output_path} from step {resume_step}...")
            else:
                # Nothing was written yet; start from the first frame instead of failing.
                print(f"\nResuming {output_path}, no existing frames found...")

        self.set_progress_bar_config(disable=disable_tqdm)

        self.scheduler = SCHEDULERS[scheduler]

        if isinstance(num_interpolation_steps, int):
            num_interpolation_steps = [num_interpolation_steps] * (len(prompts) - 1)
        else:
            # Work on a copy: the do_loop branch below appends to this list.
            num_interpolation_steps = list(num_interpolation_steps)

        if not (len(prompts) == len(seeds) == len(num_interpolation_steps) + 1):
            raise ValueError(
                "`prompts` and `seeds` must have the same length, and `num_interpolation_steps` must have one"
                f" entry fewer, but got {len(prompts)}, {len(seeds)} and {len(num_interpolation_steps)}."
            )

        first_prompt, *prompts = prompts
        embeds_a = self.embed_text(first_prompt)

        first_seed, *seeds = seeds
        latents_a = torch.randn(
            (1, self.unet.in_channels, height // 8, width // 8),
            device=self.device,
            generator=torch.Generator(device=self.device).manual_seed(first_seed),
        )

        if do_loop:
            prompts.append(first_prompt)
            seeds.append(first_seed)
            num_interpolation_steps.append(num_interpolation_steps[0])

        frame_index = 0
        total_frame_count = sum(num_interpolation_steps)

        for prompt, seed, num_step in zip(prompts, seeds, num_interpolation_steps):
            # Text
            embeds_b = self.embed_text(prompt)

            # Latent Noise
            latents_b = torch.randn(
                (1, self.unet.in_channels, height // 8, width // 8),
                device=self.device,
                generator=torch.Generator(device=self.device).manual_seed(seed),
            )

            latents_batch, embeds_batch = None, None

            for i, t in enumerate(np.linspace(0, 1, num_step)):

                frame_filepath = output_path / (f"frame%06d{frame_filename_ext}" % frame_index)
                if resume and frame_filepath.is_file():
                    frame_index += 1
                    continue

                if use_lerp_for_text:
                    embeds = torch.lerp(embeds_a, embeds_b, float(t))
                else:
                    embeds = slerp(float(t), embeds_a, embeds_b)
                latents = slerp(float(t), latents_a, latents_b)

                embeds_batch = embeds if embeds_batch is None else torch.cat([embeds_batch, embeds])
                latents_batch = latents if latents_batch is None else torch.cat([latents_batch, latents])

                del embeds
                del latents
                torch.cuda.empty_cache()

                # Flush on a full batch or on the last frame of the segment. The last frame is
                # detected by index rather than `t == 1.0`, which never holds when num_step == 1.
                batch_is_ready = embeds_batch.shape[0] == batch_size or i == num_step - 1
                if not batch_is_ready:
                    continue

                do_print_progress = (i == 0) or ((frame_index) % 20 == 0)
                if do_print_progress:
                    print(f"COUNT: {frame_index}/{total_frame_count}")

                with torch.autocast("cuda"):
                    # `step` returns a StableDiffusionPipelineOutput; read its `images` field.
                    outputs = self.step(
                        latents=latents_batch,
                        text_embeddings=embeds_batch,
                        height=height,
                        width=width,
                        guidance_scale=guidance_scale,
                        eta=eta,
                        num_inference_steps=num_inference_steps,
                        output_type='pil' if not upsample else 'numpy'
                    ).images

                    del embeds_batch
                    del latents_batch
                    torch.cuda.empty_cache()
                    latents_batch, embeds_batch = None, None

                    if upsample:
                        images = [self.upsampler(output) for output in outputs]
                    else:
                        images = outputs

                for image in images:
                    frame_filepath = output_path / (f"frame%06d{frame_filename_ext}" % frame_index)
                    image.save(frame_filepath)
                    frame_index += 1

            # The end point of this segment is the start point of the next one.
            embeds_a = embeds_b
            latents_a = latents_b

        if make_video:
            return make_video_ffmpeg(output_path, f"{name}.mp4", fps=fps, frame_filename=f"frame%06d{frame_filename_ext}")

    def embed_text(self, text):
        """Tokenize ``text`` (padded/truncated to the tokenizer's max length) and return the text encoder's first output."""
        with torch.autocast("cuda"):
            text_input = self.tokenizer(
                text,
                padding="max_length",
                max_length=self.tokenizer.model_max_length,
                truncation=True,
                return_tensors="pt",
            )
            with torch.no_grad():
                embed = self.text_encoder(text_input.input_ids.to(self.device))[0]
        return embed