camenduru commited on Feb 1, 2023

Commit

857af3d

1 Parent(s): 8d1582a

content

Browse files

Files changed (18) hide show

.gitattributes +3 -0
.gitignore +4 -0
README.md +99 -0
__pycache__/models.cpython-38.pyc +0 -0
__pycache__/utilities.cpython-38.pyc +0 -0
demo-diffusion.py +501 -0
engine/clip.plan +3 -0
engine/unet_fp16.plan +3 -0
engine/vae.plan +3 -0
models.py +980 -0
onnx/clip.onnx +3 -0
onnx/clip.opt.onnx +3 -0
onnx/unet_fp16.onnx +3 -0
onnx/unet_fp16.opt.onnx +3 -0
onnx/vae.onnx +3 -0
onnx/vae.opt.onnx +3 -0
requirements.txt +15 -0
utilities.py +537 -0

.gitattributes CHANGED Viewed

@@ -32,3 +32,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+engine/clip.plan filter=lfs diff=lfs merge=lfs -text
+engine/unet_fp16.plan filter=lfs diff=lfs merge=lfs -text
+engine/vae.plan filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,4 @@

+__pycache__/
+onnx/*.onnx
+engine/*.plan
+output/*.png

README.md ADDED Viewed

	@@ -0,0 +1,99 @@

+# Introduction
+This demo application ("demoDiffusion") showcases the acceleration of the [Stable Diffusion](https://huggingface.co/CompVis/stable-diffusion-v1-4) pipeline using TensorRT plugins.
+# Setup
+### Clone the TensorRT OSS repository
+```bash
+git clone git@github.com:NVIDIA/TensorRT.git -b release/8.5 --single-branch
+cd TensorRT
+git submodule update --init --recursive
+```
+### Launch TensorRT NGC container
+Install nvidia-docker using [these instructions](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#docker).
+```bash
+docker run --rm -it --gpus all -v $PWD:/workspace nvcr.io/nvidia/tensorrt:22.10-py3 /bin/bash
+```
+### (Optional) Install latest TensorRT release
+```bash
+python3 -m pip install --upgrade pip
+python3 -m pip install --upgrade tensorrt
+```
+> NOTE: Alternatively, you can download and install TensorRT packages from [NVIDIA TensorRT Developer Zone](https://developer.nvidia.com/tensorrt).
+### Build TensorRT plugins library
+Build TensorRT Plugins library using the [TensorRT OSS build instructions](https://github.com/NVIDIA/TensorRT/blob/main/README.md#building-tensorrt-oss).
+```bash
+export TRT_OSSPATH=/workspace
+cd $TRT_OSSPATH
+mkdir -p build && cd build
+cmake .. -DTRT_OUT_DIR=$PWD/out
+cd plugin
+make -j$(nproc)
+export PLUGIN_LIBS="$TRT_OSSPATH/build/out/libnvinfer_plugin.so"
+```
+### Install required packages
+```bash
+cd $TRT_OSSPATH/demo/Diffusion
+pip3 install -r requirements.txt
+# Create output directories
+mkdir -p onnx engine output
+```
+> NOTE: demoDiffusion has been tested on systems with NVIDIA A100, RTX3090, and RTX4090 GPUs, and the following software configuration.
+```
+cuda-python         11.8.1
+diffusers           0.7.2
+onnx                1.12.0
+onnx-graphsurgeon   0.3.25
+onnxruntime         1.13.1
+polygraphy          0.43.1
+tensorrt            8.5.1.7
+tokenizers          0.13.2
+torch               1.12.0+cu116
+transformers        4.24.0
+```
+> NOTE: optionally install HuggingFace [accelerate](https://pypi.org/project/accelerate/) package for faster and less memory-intense model loading.
+# Running demoDiffusion
+### Review usage instructions
+```bash
+python3 demo-diffusion.py --help
+```
+### HuggingFace user access token
+To download the model checkpoints for the Stable Diffusion pipeline, you will need a `read` access token. See [instructions](https://huggingface.co/docs/hub/security-tokens).
+```bash
+export HF_TOKEN=<your access token>
+```
+### Generate an image guided by a single text prompt
+```bash
+LD_PRELOAD=${PLUGIN_LIBS} python3 demo-diffusion.py "a beautiful photograph of Mt. Fuji during cherry blossom" --hf-token=$HF_TOKEN -v
+```
+# Restrictions
+- Up to 16 simultaneous prompts (maximum batch size) per inference.
+- To generate images of dynamic shapes without rebuilding the engines, use `--build-dynamic-shape`.
+- Supports image sizes between 256x256 and 1024x1024.

__pycache__/models.cpython-38.pyc ADDED Viewed

Binary file (29.3 kB). View file

__pycache__/utilities.cpython-38.pyc ADDED Viewed

Binary file (16.1 kB). View file

demo-diffusion.py ADDED Viewed

	@@ -0,0 +1,501 @@

+#
+# SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import argparse
+from cuda import cudart
+from models import CLIP, UNet, VAE
+import numpy as np
+import nvtx
+import os
+import onnx
+from polygraphy import cuda
+import time
+import torch
+from transformers import CLIPTokenizer
+import tensorrt as trt
+from utilities import Engine, DPMScheduler, LMSDiscreteScheduler, save_image, TRT_LOGGER
+def parseArgs():
+    """
+    Parse the command-line options of the Stable Diffusion demo.
+
+    Options are read from `sys.argv` by `argparse`.
+
+    Returns:
+        argparse.Namespace: the parsed options (prompt, image size, ONNX export,
+        TensorRT engine build and inference settings).
+    """
+    parser = argparse.ArgumentParser(description="Options for Stable Diffusion Demo")
+    # Stable Diffusion configuration
+    parser.add_argument('prompt', nargs = '*', help="Text prompt(s) to guide image generation")
+    parser.add_argument('--negative-prompt', nargs = '*', default=[''], help="The negative prompt(s) to guide the image generation.")
+    parser.add_argument('--repeat-prompt', type=int, default=1, choices=[1, 2, 4, 8, 16], help="Number of times to repeat the prompt (batch size multiplier)")
+    parser.add_argument('--height', type=int, default=512, help="Height of image to generate (must be multiple of 8)")
+    # The help text previously said "Height" for this option as well.
+    parser.add_argument('--width', type=int, default=512, help="Width of image to generate (must be multiple of 8)")
+    parser.add_argument('--num-images', type=int, default=1, help="Number of images to generate per prompt")
+    parser.add_argument('--denoising-steps', type=int, default=50, help="Number of denoising steps")
+    parser.add_argument('--denoising-prec', type=str, default='fp16', choices=['fp32', 'fp16'], help="Denoiser model precision")
+    parser.add_argument('--scheduler', type=str, default="LMSD", choices=["LMSD", "DPM"], help="Scheduler for diffusion process")
+    # ONNX export
+    parser.add_argument('--onnx-opset', type=int, default=16, choices=range(7,18), help="Select ONNX opset version to target for exported models")
+    parser.add_argument('--onnx-dir', default='onnx', help="Output directory for ONNX export")
+    parser.add_argument('--force-onnx-export', action='store_true', help="Force ONNX export of CLIP, UNET, and VAE models")
+    parser.add_argument('--force-onnx-optimize', action='store_true', help="Force ONNX optimizations for CLIP, UNET, and VAE models")
+    parser.add_argument('--onnx-minimal-optimization', action='store_true', help="Restrict ONNX optimization to const folding and shape inference.")
+    # TensorRT engine build
+    parser.add_argument('--engine-dir', default='engine', help="Output directory for TensorRT engines")
+    parser.add_argument('--force-engine-build', action='store_true', help="Force rebuilding the TensorRT engine")
+    parser.add_argument('--build-static-batch', action='store_true', help="Build TensorRT engines with fixed batch size.")
+    parser.add_argument('--build-dynamic-shape', action='store_true', help="Build TensorRT engines with dynamic image shapes.")
+    parser.add_argument('--build-preview-features', action='store_true', help="Build TensorRT engines with preview features.")
+    # TensorRT inference
+    parser.add_argument('--num-warmup-runs', type=int, default=5, help="Number of warmup runs before benchmarking performance")
+    parser.add_argument('--nvtx-profile', action='store_true', help="Enable NVTX markers for performance profiling")
+    parser.add_argument('--seed', type=int, default=None, help="Seed for random generator to get consistent results")
+    parser.add_argument('--output-dir', default='output', help="Output directory for logs and image artifacts")
+    parser.add_argument('--hf-token', type=str, help="HuggingFace API access token for downloading model checkpoints")
+    parser.add_argument('-v', '--verbose', action='store_true', help="Show verbose output")
+    return parser.parse_args()
+class DemoDiffusion:
+    """
+    Application showcasing the acceleration of Stable Diffusion v1.4 pipeline using NVidia TensorRT w/ Plugins.
+    """
+    def __init__(
+        self,
+        denoising_steps,
+        denoising_fp16=True,
+        scheduler="LMSD",
+        guidance_scale=7.5,
+        device='cuda',
+        output_dir='.',
+        hf_token=None,
+        verbose=False,
+        nvtx_profile=False,
+        max_batch_size=16
+    ):
+        """
+        Set up the pipeline state: settings, scheduler, model wrappers and CUDA stream.
+        Args:
+            denoising_steps (int):
+                The number of denoising steps.
+                More denoising steps usually lead to a higher quality image at the expense of slower inference.
+            denoising_fp16 (bool):
+                Run the denoising loop (UNet) in fp16 precision.
+                When enabled image quality will be lower but generally results in higher throughput.
+            scheduler (str):
+                Which scheduler to instantiate: "DPM" or "LMSD".
+            guidance_scale (float):
+                Classifier-free guidance scale; must be greater than 1.
+                Higher guidance scale encourages to generate images that are closely linked to the text prompt, usually at the expense of lower image quality.
+            device (str):
+                PyTorch device to run inference. Default: 'cuda'
+            output_dir (str):
+                Output directory for log files and image artifacts
+            hf_token (str):
+                HuggingFace User Access Token to use for downloading Stable Diffusion model checkpoints.
+            verbose (bool):
+                Enable verbose logging.
+            nvtx_profile (bool):
+                Insert NVTX profiling markers.
+            max_batch_size (int):
+                Max batch size for dynamic batch engines.
+        Raises:
+            AssertionError: if guidance_scale is not greater than 1.0.
+            ValueError: if scheduler is neither "DPM" nor "LMSD".
+        """
+        # Only a single image per prompt is supported.
+        self.num_images = 1
+        self.denoising_steps = denoising_steps
+        self.denoising_fp16 = denoising_fp16
+        assert guidance_scale > 1.0
+        self.guidance_scale = guidance_scale
+        self.output_dir = output_dir
+        self.hf_token = hf_token
+        self.device = device
+        self.verbose = verbose
+        self.nvtx_profile = nvtx_profile
+
+        # Pick the scheduler class used together with the UNet to denoise the
+        # encoded image latents, then build it with the fixed beta schedule.
+        if scheduler == "DPM":
+            scheduler_cls = DPMScheduler
+        elif scheduler == "LMSD":
+            scheduler_cls = LMSDiscreteScheduler
+        else:
+            raise ValueError("Scheduler should be either DPM or LMSD")
+        self.scheduler = scheduler_cls(
+            device=self.device,
+            num_train_timesteps=1000,
+            beta_start=0.00085,
+            beta_end=0.012,
+        )
+
+        # The tokenizer is filled in later by loadModules().
+        self.tokenizer = None
+
+        # Model wrappers, keyed by the name also used for the ONNX/engine files.
+        self.unet_model_key = 'unet_fp16' if denoising_fp16 else 'unet'
+        shared_kwargs = dict(hf_token=hf_token, device=device, verbose=verbose, max_batch_size=max_batch_size)
+        self.models = {
+            'clip': CLIP(**shared_kwargs),
+            self.unet_model_key: UNet(fp16=denoising_fp16, **shared_kwargs),
+            'vae': VAE(**shared_kwargs),
+        }
+
+        # Engines are populated by loadEngines(); all inference shares one stream.
+        self.engine = {}
+        self.stream = cuda.Stream()
+    def teardown(self):
+        """
+        Release the engines held by this pipeline, then free the CUDA stream.
+
+        The previous loop (`for engine in ...: del engine`) only unbound the loop
+        variable and left every engine referenced by `self.engine`. Clearing the
+        dict actually drops those references before the stream is freed.
+        """
+        self.engine.clear()
+        self.stream.free()
+        del self.stream
+    def getModelPath(self, name, onnx_dir, opt=True):
+        """Return the ONNX file path for `name` in `onnx_dir`; the file name gets an '.opt' infix when `opt` is truthy."""
+        extension = '.opt.onnx' if opt else '.onnx'
+        return os.path.join(onnx_dir, name + extension)
+    def loadEngines(
+        self,
+        engine_dir,
+        onnx_dir,
+        onnx_opset,
+        opt_batch_size,
+        opt_image_height,
+        opt_image_width,
+        force_export=False,
+        force_optimize=False,
+        force_build=False,
+        minimal_optimization=False,
+        static_batch=False,
+        static_shape=True,
+        enable_preview=False,
+    ):
+        """
+        Build and load engines for TensorRT accelerated inference.
+        Export ONNX models first, if applicable.
+        ONNX export/optimization is only considered for models whose engine is
+        (re)built, i.e. when the engine file is missing or force_build is set.
+        Args:
+            engine_dir (str):
+                Directory to write the TensorRT engines.
+            onnx_dir (str):
+                Directory to write the ONNX models.
+            onnx_opset (int):
+                ONNX opset version to export the models.
+            opt_batch_size (int):
+                Batch size to optimize for during engine building.
+            opt_image_height (int):
+                Image height to optimize for during engine building. Must be a multiple of 8.
+            opt_image_width (int):
+                Image width to optimize for during engine building. Must be a multiple of 8.
+            force_export (bool):
+                Force re-exporting the ONNX models (also re-runs the ONNX optimization).
+            force_optimize (bool):
+                Force re-optimizing the ONNX models.
+            force_build (bool):
+                Force re-building the TensorRT engine.
+            minimal_optimization (bool):
+                Apply minimal optimizations during build (no plugins).
+            static_batch (bool):
+                Build engine only for specified opt_batch_size.
+            static_shape (bool):
+                Build engine only for specified opt_image_height & opt_image_width. Default = True.
+            enable_preview (bool):
+                Enable TensorRT preview features.
+        """
+        # Build engines
+        for model_name, obj in self.models.items():
+            engine = Engine(model_name, engine_dir)
+            if force_build or not os.path.exists(engine.engine_path):
+                onnx_path = self.getModelPath(model_name, onnx_dir, opt=False)
+                onnx_opt_path = self.getModelPath(model_name, onnx_dir)
+                # Regenerate the optimized model when it is missing or when a refresh
+                # was requested. Previously the force flags were ignored whenever the
+                # optimized file already existed, and the "cached" branch below could
+                # never be reached.
+                if force_export or force_optimize or not os.path.exists(onnx_opt_path):
+                    # Export onnx
+                    if force_export or not os.path.exists(onnx_path):
+                        print(f"Exporting model: {onnx_path}")
+                        model = obj.get_model()
+                        with torch.inference_mode(), torch.autocast("cuda"):
+                            inputs = obj.get_sample_input(opt_batch_size, opt_image_height, opt_image_width)
+                            torch.onnx.export(
+                                model,
+                                inputs,
+                                onnx_path,
+                                export_params=True,
+                                opset_version=onnx_opset,
+                                do_constant_folding=True,
+                                input_names=obj.get_input_names(),
+                                output_names=obj.get_output_names(),
+                                dynamic_axes=obj.get_dynamic_axes(),
+                            )
+                    else:
+                        print(f"Found cached model: {onnx_path}")
+                    # Optimize onnx
+                    print(f"Generating optimizing model: {onnx_opt_path}")
+                    onnx_opt_graph = obj.optimize(onnx.load(onnx_path), minimal_optimization=minimal_optimization)
+                    onnx.save(onnx_opt_graph, onnx_opt_path)
+                else:
+                    print(f"Found cached optimized model: {onnx_opt_path} ")
+                # Build engine
+                input_profile = obj.get_input_profile(
+                    opt_batch_size,
+                    opt_image_height,
+                    opt_image_width,
+                    static_batch=static_batch,
+                    static_shape=static_shape,
+                )
+                engine.build(onnx_opt_path, fp16=True, input_profile=input_profile, enable_preview=enable_preview)
+            self.engine[model_name] = engine
+        # Separate iteration to activate engines
+        for model_name in self.models:
+            self.engine[model_name].activate()
+    def loadModules(self):
+        """Load the CLIP tokenizer and prepare the scheduler for `self.denoising_steps` steps."""
+        tokenizer_id = "openai/clip-vit-large-patch14"
+        self.tokenizer = CLIPTokenizer.from_pretrained(tokenizer_id)
+        scheduler = self.scheduler
+        scheduler.set_timesteps(self.denoising_steps)
+        # Pre-compute latent input scales and linear multistep coefficients
+        scheduler.configure()
+    def runEngine(self, model_name, feed_dict):
+        """Run the engine registered as `model_name` on `feed_dict` using the pipeline's stream and return its result."""
+        return self.engine[model_name].infer(feed_dict, self.stream)
+    def infer(
+        self,
+        prompt,
+        negative_prompt,
+        image_height,
+        image_width,
+        warmup = False,
+        verbose = False,
+    ):
+        """
+        Run the diffusion pipeline: CLIP text encoding, the UNet denoising loop and VAE decoding.
+        When `warmup` is false, a latency table is printed and the images are saved to `self.output_dir`.
+        The method has no return statement, so it returns None.
+        Args:
+            prompt (sequence of str):
+                The text prompts to guide image generation. Its length is used as the batch size.
+            negative_prompt (sequence of str):
+                The prompts not to guide the image generation. Must have the same length as `prompt`.
+            image_height (int):
+                Height (in pixels) of the image to be generated. Must be a multiple of 8.
+            image_width (int):
+                Width (in pixels) of the image to be generated. Must be a multiple of 8.
+            warmup (bool):
+                Indicate if this is a warmup run (skips the latency report and image saving).
+            verbose (bool):
+                Enable verbose logging. NOTE(review): not referenced anywhere in this method body.
+        Raises:
+            AssertionError: if `prompt` and `negative_prompt` differ in length.
+        """
+        # Process inputs
+        batch_size = len(prompt)
+        assert len(prompt) == len(negative_prompt)
+        # Spatial dimensions of latent tensor
+        latent_height = image_height // 8
+        latent_width = image_width // 8
+        # Create profiling events
+        # NOTE(review): six events are created on every call and no matching destroy call
+        # is visible in this method; confirm whether they are released elsewhere.
+        events = {}
+        for stage in ['clip', 'denoise', 'vae']:
+            for marker in ['start', 'stop']:
+                # Index [1] takes the second element of the tuple returned by cudaEventCreate().
+                events[stage+'-'+marker] = cudart.cudaEventCreate()[1]
+        # Allocate buffers for TensorRT engine bindings
+        for model_name, obj in self.models.items():
+            self.engine[model_name].allocate_buffers(shape_dict=obj.get_shape_dict(batch_size, image_height, image_width), device=self.device)
+        generator = None
+        # NOTE(review): `args` is not a parameter or attribute; it is the module-level name
+        # bound in the `__main__` block, so calling infer() from an importing module would
+        # raise NameError here.
+        if args.seed is not None:
+            generator = torch.Generator(device="cuda").manual_seed(args.seed)
+        # Run Stable Diffusion pipeline
+        # NOTE(review): `runtime` is bound by the `with` statement but never used below.
+        with torch.inference_mode(), torch.autocast("cuda"), trt.Runtime(TRT_LOGGER) as runtime:
+            # latents need to be generated on the target device
+            unet_channels = 4 # unet.in_channels
+            latents_shape = (batch_size * self.num_images, unet_channels, latent_height, latent_width)
+            latents_dtype = torch.float32 # text_embeddings.dtype
+            latents = torch.randn(latents_shape, device=self.device, dtype=latents_dtype, generator=generator)
+            # Scale the initial noise by the standard deviation required by the scheduler
+            latents = latents * self.scheduler.init_noise_sigma
+            # Synchronize before starting the wall-clock timer so earlier GPU work is not counted.
+            torch.cuda.synchronize()
+            e2e_tic = time.perf_counter()
+            if self.nvtx_profile:
+                nvtx_clip = nvtx.start_range(message='clip', color='green')
+            cudart.cudaEventRecord(events['clip-start'], 0)
+            # Tokenize input
+            # NOTE(review): unlike the negative-prompt call below, this call does not pass
+            # truncation=True; confirm how prompts longer than model_max_length are handled.
+            text_input_ids = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                return_tensors="pt",
+            ).input_ids.type(torch.int32).to(self.device)
+            # CLIP text encoder
+            text_input_ids_inp = cuda.DeviceView(ptr=text_input_ids.data_ptr(), shape=text_input_ids.shape, dtype=np.int32)
+            text_embeddings = self.runEngine('clip', {"input_ids": text_input_ids_inp})['text_embeddings']
+            # Duplicate text embeddings for each generation per prompt
+            bs_embed, seq_len, _ = text_embeddings.shape
+            text_embeddings = text_embeddings.repeat(1, self.num_images, 1)
+            text_embeddings = text_embeddings.view(bs_embed * self.num_images, seq_len, -1)
+            # Negative prompts are padded/truncated to the same token length as the prompts.
+            max_length = text_input_ids.shape[-1]
+            uncond_input_ids = self.tokenizer(
+                negative_prompt,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            ).input_ids.type(torch.int32).to(self.device)
+            uncond_input_ids_inp = cuda.DeviceView(ptr=uncond_input_ids.data_ptr(), shape=uncond_input_ids.shape, dtype=np.int32)
+            uncond_embeddings = self.runEngine('clip', {"input_ids": uncond_input_ids_inp})['text_embeddings']
+            # Duplicate unconditional embeddings for each generation per prompt
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(1, self.num_images, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * self.num_images, seq_len, -1)
+            # Concatenate the unconditional and text embeddings into a single batch to avoid doing two forward passes for classifier free guidance
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+            if self.denoising_fp16:
+                text_embeddings = text_embeddings.to(dtype=torch.float16)
+            cudart.cudaEventRecord(events['clip-stop'], 0)
+            if self.nvtx_profile:
+                nvtx.end_range(nvtx_clip)
+            cudart.cudaEventRecord(events['denoise-start'], 0)
+            for step_index, timestep in enumerate(self.scheduler.timesteps):
+                if self.nvtx_profile:
+                    nvtx_latent_scale = nvtx.start_range(message='latent_scale', color='pink')
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2)
+                # LMSDiscreteScheduler.scale_model_input()
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, step_index)
+                if self.nvtx_profile:
+                    nvtx.end_range(nvtx_latent_scale)
+                # predict the noise residual
+                if self.nvtx_profile:
+                    nvtx_unet = nvtx.start_range(message='unet', color='blue')
+                # Only the embeddings view follows the fp16/fp32 switch; the sample and
+                # timestep views below are always declared as float32.
+                dtype = np.float16 if self.denoising_fp16 else np.float32
+                if timestep.dtype != torch.float32:
+                    timestep_float = timestep.float()
+                else:
+                    timestep_float = timestep
+                sample_inp = cuda.DeviceView(ptr=latent_model_input.data_ptr(), shape=latent_model_input.shape, dtype=np.float32)
+                timestep_inp = cuda.DeviceView(ptr=timestep_float.data_ptr(), shape=timestep_float.shape, dtype=np.float32)
+                embeddings_inp = cuda.DeviceView(ptr=text_embeddings.data_ptr(), shape=text_embeddings.shape, dtype=dtype)
+                noise_pred = self.runEngine(self.unet_model_key, {"sample": sample_inp, "timestep": timestep_inp, "encoder_hidden_states": embeddings_inp})['latent']
+                if self.nvtx_profile:
+                    nvtx.end_range(nvtx_unet)
+                if self.nvtx_profile:
+                    nvtx_latent_step = nvtx.start_range(message='latent_step', color='pink')
+                # Perform guidance
+                # The first half of the batch is the unconditional prediction, the second half
+                # the text-conditioned one (matching the torch.cat order used above).
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
+                latents = self.scheduler.step(noise_pred, latents, step_index, timestep)
+                if self.nvtx_profile:
+                    nvtx.end_range(nvtx_latent_step)
+            # Rescale the latents before decoding.
+            # NOTE(review): 0.18215 is presumably the VAE latent scaling factor - confirm.
+            latents = 1. / 0.18215 * latents
+            cudart.cudaEventRecord(events['denoise-stop'], 0)
+            if self.nvtx_profile:
+                nvtx_vae = nvtx.start_range(message='vae', color='red')
+            cudart.cudaEventRecord(events['vae-start'], 0)
+            sample_inp = cuda.DeviceView(ptr=latents.data_ptr(), shape=latents.shape, dtype=np.float32)
+            images = self.runEngine('vae', {"latent": sample_inp})['images']
+            cudart.cudaEventRecord(events['vae-stop'], 0)
+            if self.nvtx_profile:
+                nvtx.end_range(nvtx_vae)
+            # Wait for all GPU work to finish before stopping the wall-clock timer.
+            torch.cuda.synchronize()
+            e2e_toc = time.perf_counter()
+            if not warmup:
+                # Per-stage latencies come from the CUDA events; the pipeline total is wall-clock time.
+                print('|------------|--------------|')
+                print('| {:^10} | {:^12} |'.format('Module', 'Latency'))
+                print('|------------|--------------|')
+                print('| {:^10} | {:>9.2f} ms |'.format('CLIP', cudart.cudaEventElapsedTime(events['clip-start'], events['clip-stop'])[1]))
+                print('| {:^10} | {:>9.2f} ms |'.format('UNet x '+str(self.denoising_steps), cudart.cudaEventElapsedTime(events['denoise-start'], events['denoise-stop'])[1]))
+                print('| {:^10} | {:>9.2f} ms |'.format('VAE', cudart.cudaEventElapsedTime(events['vae-start'], events['vae-stop'])[1]))
+                print('|------------|--------------|')
+                print('| {:^10} | {:>9.2f} ms |'.format('Pipeline', (e2e_toc - e2e_tic)*1000.))
+                print('|------------|--------------|')
+                # Save image
+                # The prefix joins the de-duplicated first 10 characters of each prompt (spaces
+                # replaced by underscores). NOTE(review): a set has no guaranteed iteration
+                # order, so with several distinct prompts the order inside the prefix may vary.
+                image_name_prefix = 'sd-'+('fp16' if self.denoising_fp16 else 'fp32')+''.join(set(['-'+prompt[i].replace(' ','_')[:10] for i in range(batch_size)]))+'-'
+                save_image(images, self.output_dir, image_name_prefix)
+if __name__ == "__main__":
+    # Script entry point: parse options, validate them, build/load the engines,
+    # run the warmup iterations and then one reported inference.
+    print("[I] Initializing StableDiffusion demo with TensorRT Plugins")
+    # NOTE: `args` must stay a module-level name; DemoDiffusion.infer() reads `args.seed`.
+    args = parseArgs()
+    # Process prompt
+    if not isinstance(args.prompt, list):
+        raise ValueError(f"`prompt` must be of type `str` or `str` list, but is {type(args.prompt)}")
+    prompt = args.prompt * args.repeat_prompt
+    # The positional prompt uses nargs='*', so it may legitimately parse to an empty list;
+    # fail early instead of attempting to build engines for a batch size of 0.
+    if not prompt:
+        raise ValueError("At least one text prompt must be provided")
+    if not isinstance(args.negative_prompt, list):
+        raise ValueError(f"`--negative-prompt` must be of type `str` or `str` list, but is {type(args.negative_prompt)}")
+    if len(args.negative_prompt) == 1:
+        # A single negative prompt is shared by every prompt in the batch.
+        negative_prompt = args.negative_prompt * len(prompt)
+    else:
+        negative_prompt = args.negative_prompt
+    # infer() asserts equal lengths; report the mismatch here, before the costly engine build.
+    if len(negative_prompt) != len(prompt):
+        raise ValueError(f"Number of negative prompts ({len(negative_prompt)}) must be 1 or match the number of prompts ({len(prompt)})")
+    max_batch_size = 16
+    if args.build_dynamic_shape:
+        max_batch_size = 4
+    if len(prompt) > max_batch_size:
+        raise ValueError(f"Batch size {len(prompt)} is larger than allowed {max_batch_size}. If dynamic shape is used, then maximum batch size is 4")
+    # Validate image dimensions
+    image_height = args.height
+    image_width = args.width
+    if image_height % 8 != 0 or image_width % 8 != 0:
+        raise ValueError(f"Image height and width have to be divisible by 8 but specified as: {image_height} and {image_width}.")
+    # Register TensorRT plugins
+    trt.init_libnvinfer_plugins(TRT_LOGGER, '')
+    # Initialize demo
+    demo = DemoDiffusion(
+        denoising_steps=args.denoising_steps,
+        denoising_fp16=(args.denoising_prec == 'fp16'),
+        output_dir=args.output_dir,
+        scheduler=args.scheduler,
+        hf_token=args.hf_token,
+        verbose=args.verbose,
+        nvtx_profile=args.nvtx_profile,
+        max_batch_size=max_batch_size)
+    # Load TensorRT engines and pytorch modules
+    demo.loadEngines(
+        args.engine_dir,
+        args.onnx_dir,
+        args.onnx_opset,
+        opt_batch_size=len(prompt),
+        opt_image_height=image_height,
+        opt_image_width=image_width,
+        force_export=args.force_onnx_export,
+        force_optimize=args.force_onnx_optimize,
+        force_build=args.force_engine_build,
+        minimal_optimization=args.onnx_minimal_optimization,
+        static_batch=args.build_static_batch,
+        static_shape=not args.build_dynamic_shape,
+        enable_preview=args.build_preview_features)
+    demo.loadModules()
+    print("[I] Warming up ..")
+    # infer() has no return value, so its result is not stored.
+    for _ in range(args.num_warmup_runs):
+        demo.infer(prompt, negative_prompt, image_height, image_width, warmup=True, verbose=False)
+    print("[I] Running StableDiffusion pipeline")
+    if args.nvtx_profile:
+        cudart.cudaProfilerStart()
+    demo.infer(prompt, negative_prompt, image_height, image_width, verbose=args.verbose)
+    if args.nvtx_profile:
+        cudart.cudaProfilerStop()
+    demo.teardown()

engine/clip.plan ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:924b8fe294a4b892377f2088dbf05077b7b4ec39b81772adc83bf25e91b21ab0
+size 247775035

engine/unet_fp16.plan ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a86c92b493e73d8ce6630cd426a56fb9ed3b49136cbdbd706b3b5814b7b90c9b
+size 1722051918

engine/vae.plan ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ddc71137cdd256dcc5b82c21b856ed25575168881782595e87a7b003534a4711
+size 99632486

models.py ADDED Viewed

	@@ -0,0 +1,980 @@

+#
+# SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from collections import OrderedDict
+from copy import deepcopy
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+import numpy as np
+from onnx import shape_inference
+import onnx_graphsurgeon as gs
+from polygraphy.backend.onnx.loader import fold_constants
+import torch
+from transformers import CLIPTextModel
+from cuda import cudart
+class Optimizer():
+    def __init__(self, onnx_graph, verbose=False):
+        """Import `onnx_graph` into a GraphSurgeon graph and store the verbosity flag."""
+        self.verbose = verbose
+        self.graph = gs.import_onnx(onnx_graph)
+    def info(self, prefix=''):
+        """Print the graph's node, tensor, input and output counts after `prefix`; does nothing unless verbose."""
+        if not self.verbose:
+            return
+        graph = self.graph
+        node_count = len(graph.nodes)
+        tensor_count = len(graph.tensors().keys())
+        input_count = len(graph.inputs)
+        output_count = len(graph.outputs)
+        print(f"{prefix} .. {node_count} nodes, {tensor_count} tensors, {input_count} inputs, {output_count} outputs")
+    def cleanup(self, return_onnx=False):
+        """Run `cleanup()` then `toposort()` on the graph; return the exported ONNX model when `return_onnx` is truthy, otherwise None."""
+        self.graph.cleanup().toposort()
+        return gs.export_onnx(self.graph) if return_onnx else None
+    def select_outputs(self, keep, names=None):
+        """Keep only the graph outputs at the indices in `keep` (in that order) and, if `names` is given, rename the kept outputs positionally."""
+        graph = self.graph
+        graph.outputs = [graph.outputs[index] for index in keep]
+        if not names:
+            return
+        for position, new_name in enumerate(names):
+            graph.outputs[position].name = new_name
+    def fold_constants(self, return_onnx=False):
+        """Fold constants in the exported graph, re-import the result as the current graph, and return the folded ONNX model when `return_onnx` is truthy."""
+        exported = gs.export_onnx(self.graph)
+        folded = fold_constants(exported, allow_onnxruntime_shape_inference=True)
+        self.graph = gs.import_onnx(folded)
+        return folded if return_onnx else None
+    def infer_shapes(self, return_onnx=False):
+        """
+        Run ONNX shape inference on the exported graph and re-import the result.
+
+        Returns the inferred ONNX model when `return_onnx` is truthy, otherwise None.
+        Raises TypeError when the exported model is larger than 2147483648 bytes.
+        """
+        model = gs.export_onnx(self.graph)
+        # Guard first: shape inference is only attempted on models within the size limit.
+        if model.ByteSize() > 2147483648:
+            raise TypeError("ERROR: model size exceeds supported 2GB limit")
+        model = shape_inference.infer_shapes(model)
+        self.graph = gs.import_onnx(model)
+        return model if return_onnx else None
+    def remove_casts(self):
+        '''
+        Bypass Cast nodes that sit in front of the QKV projections and behind Softmax.
+
+        Consumers of the matched Cast nodes are rewired to read the un-cast tensor directly,
+        which leaves the Cast nodes without consumers before cleanup() is called.
+        Returns the number of connections that were rewired.
+        '''
+        nRemoveCastNode = 0
+        for node in self.graph.nodes:
+            # Remove Cast nodes before qkv gemm
+            # Matches an Add/Transpose whose first output feeds exactly three consumers, all of them Cast.
+            if node.op in ["Add", "Transpose"] and len(node.outputs[0].outputs) == 3 and node.o().op == "Cast" and node.o(1).op == "Cast" and node.o(2).op == "Cast":
+                for i in range(len(node.outputs[0].outputs)):
+                    # node.o(i, 0) is the i-th Cast; its first consumer is re-pointed at the tensor before the Cast.
+                    matMulNode = node.o(i, 0).o()
+                    matMulNode.inputs[0] = node.outputs[0]
+                    nRemoveCastNode += 1
+            # Remove double cast nodes after Softmax Node
+            if node.op == "Softmax" and node.o().op == "Cast" and node.o().o().op == "Cast":
+                # The node after the second Cast reads the Softmax output directly.
+                node.o().o().o().inputs[0] = node.outputs[0]
+                nRemoveCastNode += 1
+        self.cleanup()
+        return nRemoveCastNode
+    def remove_parallel_swish(self):
+        '''
+        Share one Mul output between all Gemm nodes that follow the same upstream Gemm.
+
+        For every Gemm whose output has more than 6 consumers, the output of the first Mul consumer is kept;
+        each further Mul consumer is disconnected and the Gemm after it is fed the kept tensor instead.
+        Returns the number of Mul nodes that were disconnected.
+        '''
+        mRemoveSwishNode = 0
+        for node in self.graph.nodes:
+            if node.op == "Gemm" and len(node.outputs[0].outputs) > 6:
+                swishOutputTensor = None
+                for nextNode in node.outputs[0].outputs:
+                    if nextNode.op == "Mul":
+                        if swishOutputTensor is None:
+                            # First Mul found: its output becomes the shared tensor.
+                            swishOutputTensor = nextNode.outputs[0]
+                        else:
+                            nextGemmNode = nextNode.o(0)
+                            assert nextGemmNode.op == "Gemm", "Unexpected node type for nextGemmNode {}".format(nextGemmNode.name)
+                            # Keep the Gemm's own weight and bias inputs, replace only its data input.
+                            nextGemmNode.inputs = [swishOutputTensor, nextGemmNode.inputs[1], nextGemmNode.inputs[2]]
+                            # Detach the duplicate Mul so cleanup() can drop it.
+                            nextNode.outputs.clear()
+                            mRemoveSwishNode += 1
+        self.cleanup()
+        return mRemoveSwishNode
+    def resize_fix(self):
+        '''
+        This function loops through the graph looking for Resize nodes that use scales for resize (have 3 inputs).
+        It substitutes each Resize found with a Resize that takes the size of the output tensor instead of scales.
+        It adds Shape->Slice->Concat
+                Shape->Slice----^     subgraph to the graph to extract the shape of the output tensor.
+        This fix is required for the dynamic shape support.
+        Returns the number of Resize nodes that were replaced.
+        '''
+        mResizeNodes = 0
+        for node in self.graph.nodes:
+            if node.op == "Resize" and len(node.inputs) == 3:
+                name = node.name + "/"
+                # Producer of input 1 of the node two steps downstream; its output supplies the target H and W.
+                # NOTE(review): presumably the skip-connection branch the upsampled tensor is merged with - confirm.
+                add_node = node.o().o().i(1)
+                # Producer of the Resize data input; its output supplies the batch and channel dims.
+                div_node = node.i()
+                shape_hw_out = gs.Variable(name=name + "shape_hw_out", dtype=np.int64, shape=[4])
+                shape_hw = gs.Node(op="Shape", name=name+"shape_hw", inputs=[add_node.outputs[0]], outputs=[shape_hw_out])
+                const_zero = gs.Constant(name=name + "const_zero", values=np.array([0], dtype=np.int64))
+                const_two = gs.Constant(name=name + "const_two", values=np.array([2], dtype=np.int64))
+                const_four = gs.Constant(name=name + "const_four", values=np.array([4], dtype=np.int64))
+                # Slice [2:4] along axis 0 of the shape vector.
+                slice_hw_out = gs.Variable(name=name + "slice_hw_out", dtype=np.int64, shape=[2])
+                slice_hw = gs.Node(op="Slice", name=name+"slice_hw", inputs=[shape_hw_out, const_two, const_four, const_zero], outputs=[slice_hw_out])
+                shape_bc_out = gs.Variable(name=name + "shape_bc_out", dtype=np.int64, shape=[2])
+                shape_bc = gs.Node(op="Shape", name=name+"shape_bc", inputs=[div_node.outputs[0]], outputs=[shape_bc_out])
+                # Slice [0:2] along axis 0 of the shape vector.
+                slice_bc_out = gs.Variable(name=name + "slice_bc_out", dtype=np.int64, shape=[2])
+                slice_bc = gs.Node(op="Slice", name=name+"slice_bc", inputs=[shape_bc_out, const_zero, const_two, const_zero], outputs=[slice_bc_out])
+                concat_bchw_out = gs.Variable(name=name + "concat_bchw_out", dtype=np.int64, shape=[4])
+                concat_bchw = gs.Node(op="Concat", name=name+"concat_bchw", attrs={"axis": 0}, inputs=[slice_bc_out, slice_hw_out], outputs=[concat_bchw_out])
+                # Empty placeholders fill the two inputs between the data input and the sizes input.
+                none_var = gs.Variable.empty()
+                resize_bchw = gs.Node(op="Resize", name=name+"resize_bchw", attrs=node.attrs, inputs=[node.inputs[0], none_var, none_var, concat_bchw_out], outputs=[node.outputs[0]])
+                self.graph.nodes.extend([shape_hw, slice_hw, shape_bc, slice_bc, concat_bchw, resize_bchw])
+                # Disconnect the original Resize so cleanup() removes it.
+                node.inputs = []
+                node.outputs = []
+                mResizeNodes += 1
+        self.cleanup()
+        return mResizeNodes
+    def adjustAddNode(self):
+        nAdjustAddNode = 0
+        for node in self.graph.nodes:
+            # Change the bias const to the second input to allow Gemm+BiasAdd fusion in TRT.
+            if node.op in ["Add"] and isinstance(node.inputs[0], gs.ir.tensor.Constant):
+                tensor = node.inputs[1]
+                bias = node.inputs[0]
+                node.inputs = [tensor, bias]
+                nAdjustAddNode += 1
+        self.cleanup()
+        return nAdjustAddNode
+    def decompose_instancenorms(self):
+        '''
+        Replace every InstanceNormalization node with an explicit chain of elementwise/reduction nodes:
+        ReduceMean -> Sub -> Pow(2) -> ReduceMean -> Add(epsilon) -> Sqrt -> Div -> Mul(scale) -> Add(bias).
+
+        Returns the number of InstanceNormalization nodes that were decomposed.
+        '''
+        nRemoveInstanceNorm = 0
+        for node in self.graph.nodes:
+            if node.op == "InstanceNormalization":
+                name = node.name + "/"
+                input_tensor = node.inputs[0]
+                output_tensor = node.outputs[0]
+                # Mean over the last axis.
+                mean_out = gs.Variable(name=name + "mean_out")
+                mean_node = gs.Node(op="ReduceMean", name=name + "mean_node", attrs={"axes": [-1]}, inputs=[input_tensor], outputs=[mean_out])
+                # Centered input: x - mean.
+                sub_out = gs.Variable(name=name + "sub_out")
+                sub_node = gs.Node(op="Sub", name=name + "sub_node", attrs={}, inputs=[input_tensor, mean_out], outputs=[sub_out])
+                # Variance: mean of the squared centered input over the last axis.
+                pow_out = gs.Variable(name=name + "pow_out")
+                pow_const = gs.Constant(name=name + "pow_const", values=np.array([2.0], dtype=np.float32))
+                pow_node = gs.Node(op="Pow", name=name + "pow_node", attrs={}, inputs=[sub_out, pow_const], outputs=[pow_out])
+                mean2_out = gs.Variable(name=name + "mean2_out")
+                mean2_node = gs.Node(op="ReduceMean", name=name + "mean2_node", attrs={"axes": [-1]}, inputs=[pow_out], outputs=[mean2_out])
+                # sqrt(variance + epsilon), with epsilon taken from the original node's attributes.
+                epsilon_out = gs.Variable(name=name + "epsilon_out")
+                epsilon_const = gs.Constant(name=name + "epsilon_const", values=np.array([node.attrs["epsilon"]], dtype=np.float32))
+                epsilon_node = gs.Node(op="Add", name=name + "epsilon_node", attrs={}, inputs=[mean2_out, epsilon_const], outputs=[epsilon_out])
+                sqrt_out = gs.Variable(name=name + "sqrt_out")
+                sqrt_node = gs.Node(op="Sqrt", name=name + "sqrt_node", attrs={}, inputs=[epsilon_out], outputs=[sqrt_out])
+                div_out = gs.Variable(name=name + "div_out")
+                div_node = gs.Node(op="Div", name=name + "div_node", attrs={}, inputs=[sub_out, sqrt_out], outputs=[div_out])
+                # Scale and bias are read from the "value" attribute of the nodes producing inputs 1 and 2.
+                # NOTE(review): the reshape hard-codes 32 entries - assumes every matched node has 32 channels/groups; confirm.
+                constantScale = gs.Constant("InstanceNormScaleV-" + str(nRemoveInstanceNorm), np.ascontiguousarray(node.inputs[1].inputs[0].attrs["value"].values.reshape(1, 32, 1)))
+                constantBias = gs.Constant("InstanceBiasV-" + str(nRemoveInstanceNorm), np.ascontiguousarray(node.inputs[2].inputs[0].attrs["value"].values.reshape(1, 32, 1)))
+                mul_out = gs.Variable(name=name + "mul_out")
+                mul_node = gs.Node(op="Mul", name=name + "mul_node", attrs={}, inputs=[div_out, constantScale], outputs=[mul_out])
+                # The final Add writes into the original output tensor so downstream consumers are untouched.
+                add_node = gs.Node(op="Add", name=name + "add_node", attrs={}, inputs=[mul_out, constantBias], outputs=[output_tensor])
+                self.graph.nodes.extend([mean_node, sub_node, pow_node, mean2_node, epsilon_node, sqrt_node, div_node, mul_node, add_node])
+                # Disconnect the original node so cleanup() removes it.
+                node.inputs = []
+                node.outputs = []
+                nRemoveInstanceNorm += 1
+        self.cleanup()
+        return nRemoveInstanceNorm
+    def insert_groupnorm_plugin(self):
+        '''
+        Replace each decomposed group-norm subgraph (optionally followed by Sigmoid*x) with one "GroupNorm" plugin node.
+
+        The plugin receives the tensor feeding the node in front of the matched Reshape, plus gamma and beta
+        constants copied from the Mul and Add at the end of the subgraph, and the attributes epsilon and bSwish.
+        Returns the number of plugin nodes inserted.
+        '''
+        nGroupNormPlugin = 0
+        for node in self.graph.nodes:
+            # Anchor on a Reshape feeding both ReduceMean and Sub (with ReduceMean feeding that same Sub);
+            # eleven steps downstream must be a Mul followed by an Add, and the node eight steps downstream
+            # must have a constant second input with a 3-D shape.
+            if node.op == "Reshape" and node.outputs != [] and \
+                node.o().op == "ReduceMean" and node.o(1).op == "Sub" and node.o().o() == node.o(1) and \
+                node.o().o().o().o().o().o().o().o().o().o().o().op == "Mul" and \
+                node.o().o().o().o().o().o().o().o().o().o().o().o().op == "Add" and \
+                len(node.o().o().o().o().o().o().o().o().inputs[1].values.shape) == 3:
+                # "node.outputs != []" is added for VAE
+                inputTensor = node.i().inputs[0]
+                gammaNode = node.o().o().o().o().o().o().o().o().o().o().o()
+                # Locate whichever input of the Mul is the constant (gamma).
+                index = [type(i) == gs.ir.tensor.Constant for i in gammaNode.inputs].index(True)
+                gamma = np.array(deepcopy(gammaNode.inputs[index].values.tolist()), dtype=np.float32)
+                constantGamma = gs.Constant("groupNormGamma-" + str(nGroupNormPlugin), np.ascontiguousarray(gamma.reshape(-1)))  # MUST use np.ascontiguousarray, or TRT will regard the shape of this Constant as (0) !!!
+                betaNode = gammaNode.o()
+                # Locate whichever input of the Add is the constant (beta).
+                index = [type(i) == gs.ir.tensor.Constant for i in betaNode.inputs].index(True)
+                beta = np.array(deepcopy(betaNode.inputs[index].values.tolist()), dtype=np.float32)
+                constantBeta = gs.Constant("groupNormBeta-" + str(nGroupNormPlugin), np.ascontiguousarray(beta.reshape(-1)))
+                # Epsilon is the first element of the constant second input of the node five steps downstream.
+                epsilon = node.o().o().o().o().o().inputs[1].values.tolist()[0]
+                if betaNode.o().op == "Sigmoid":  # need Swish
+                    bSwish = True
+                    lastNode = betaNode.o().o()  # Mul node of Swish
+                else:
+                    bSwish = False
+                    lastNode = betaNode  # Cast node after Group Norm
+                # Absorb a trailing Cast into the replaced subgraph.
+                if lastNode.o().op == "Cast":
+                    lastNode = lastNode.o()
+                inputList = [inputTensor, constantGamma, constantBeta]
+                groupNormV = gs.Variable("GroupNormV-" + str(nGroupNormPlugin), np.dtype(np.float16), inputTensor.shape)
+                groupNormN = gs.Node("GroupNorm", "GroupNormN-" + str(nGroupNormPlugin), inputs=inputList, outputs=[groupNormV], attrs=OrderedDict([('epsilon', epsilon), ('bSwish', int(bSwish))]))
+                self.graph.nodes.append(groupNormN)
+                # Re-point every consumer of the old subgraph output at the plugin output.
+                for subNode in self.graph.nodes:
+                    if lastNode.outputs[0] in subNode.inputs:
+                        index = subNode.inputs.index(lastNode.outputs[0])
+                        subNode.inputs[index] = groupNormV
+                # Disconnect both ends of the old subgraph so cleanup() removes it.
+                node.i().inputs = []
+                lastNode.outputs = []
+                nGroupNormPlugin += 1
+        self.cleanup()
+        return nGroupNormPlugin
+    def insert_layernorm_plugin(self):
+        '''
+        Replace each decomposed layer-norm subgraph with one "LayerNorm" plugin node.
+
+        The matched pattern is ReduceMean -> Sub -> Pow -> ReduceMean -> Add -> Sqrt -> Div -> Mul -> Add,
+        where the Mul has a 1-D constant second input. Gamma and beta are copied from the final Mul and Add.
+        Returns the number of plugin nodes inserted.
+        '''
+        nLayerNormPlugin = 0
+        for node in self.graph.nodes:
+            if node.op == 'ReduceMean' and \
+                node.o().op == 'Sub' and node.o().inputs[0] == node.inputs[0] and \
+                node.o().o(0).op =='Pow' and node.o().o(1).op =='Div' and \
+                node.o().o(0).o().op == 'ReduceMean' and \
+                node.o().o(0).o().o().op == 'Add' and \
+                node.o().o(0).o().o().o().op == 'Sqrt' and \
+                node.o().o(0).o().o().o().o().op == 'Div' and node.o().o(0).o().o().o().o() == node.o().o(1) and \
+                node.o().o(0).o().o().o().o().o().op == 'Mul' and \
+                node.o().o(0).o().o().o().o().o().o().op == 'Add' and \
+                len(node.o().o(0).o().o().o().o().o().inputs[1].values.shape) == 1:
+                # The plugin input depends on what produces the ReduceMean input.
+                if node.i().op == "Add":
+                    inputTensor = node.inputs[0]  # CLIP
+                else:
+                    inputTensor = node.i().inputs[0]  # UNet and VAE
+                gammaNode = node.o().o().o().o().o().o().o()
+                # Locate whichever input of the Mul is the constant (gamma).
+                index = [type(i) == gs.ir.tensor.Constant for i in gammaNode.inputs].index(True)
+                gamma = np.array(deepcopy(gammaNode.inputs[index].values.tolist()), dtype=np.float32)
+                constantGamma = gs.Constant("LayerNormGamma-" + str(nLayerNormPlugin), np.ascontiguousarray(gamma.reshape(-1)))  # MUST use np.ascontiguousarray, or TRT will regard the shape of this Constant as (0) !!!
+                betaNode = gammaNode.o()
+                # Locate whichever input of the Add is the constant (beta).
+                index = [type(i) == gs.ir.tensor.Constant for i in betaNode.inputs].index(True)
+                beta = np.array(deepcopy(betaNode.inputs[index].values.tolist()), dtype=np.float32)
+                constantBeta = gs.Constant("LayerNormBeta-" + str(nLayerNormPlugin), np.ascontiguousarray(beta.reshape(-1)))
+                inputList = [inputTensor, constantGamma, constantBeta]
+                layerNormV = gs.Variable("LayerNormV-" + str(nLayerNormPlugin), np.dtype(np.float32), inputTensor.shape)
+                # NOTE(review): epsilon is fixed at 1.e-5 here rather than read from the matched Add node - confirm this matches the model.
+                layerNormN = gs.Node("LayerNorm", "LayerNormN-" + str(nLayerNormPlugin), inputs=inputList, attrs=OrderedDict([('epsilon', 1.e-5)]), outputs=[layerNormV])
+                self.graph.nodes.append(layerNormN)
+                nLayerNormPlugin += 1
+                if betaNode.outputs[0] in self.graph.outputs:
+                    # The layer norm produces a graph output: swap that output for the plugin output.
+                    index = self.graph.outputs.index(betaNode.outputs[0])
+                    self.graph.outputs[index] = layerNormV
+                else:
+                    # Absorb a trailing Cast into the replaced subgraph.
+                    if betaNode.o().op == "Cast":
+                        lastNode = betaNode.o()
+                    else:
+                        lastNode = betaNode
+                    # Re-point every consumer of the old subgraph output at the plugin output.
+                    for subNode in self.graph.nodes:
+                        if lastNode.outputs[0] in subNode.inputs:
+                            index = subNode.inputs.index(lastNode.outputs[0])
+                            subNode.inputs[index] = layerNormV
+                    lastNode.outputs = []
+        self.cleanup()
+        return nLayerNormPlugin
+    def insert_splitgelu_plugin(self):
+        nSplitGeLUPlugin = 0
+        for node in self.graph.nodes:
+            if node.op == "Erf":
+                inputTensor = node.i().i().i().outputs[0]
+                lastNode = node.o().o().o().o()
+                outputShape = inputTensor.shape
+                outputShape[2] = outputShape[2] // 2
+                splitGeLUV = gs.Variable("splitGeLUV-" + str(nSplitGeLUPlugin), np.dtype(np.float32), outputShape)
+                splitGeLUN = gs.Node("SplitGeLU", "splitGeLUN-" + str(nSplitGeLUPlugin), inputs=[inputTensor], outputs=[splitGeLUV])
+                self.graph.nodes.append(splitGeLUN)
+                for subNode in self.graph.nodes:
+                    if lastNode.outputs[0] in subNode.inputs:
+                        index = subNode.inputs.index(lastNode.outputs[0])
+                        subNode.inputs[index] = splitGeLUV
+                lastNode.outputs = []
+                nSplitGeLUPlugin += 1
+        self.cleanup()
+        return nSplitGeLUPlugin
+    def insert_seq2spatial_plugin(self):
+        '''
+        Replace each Add(bias) -> Add(residual) -> Reshape -> Transpose chain that feeds a Conv
+        with a single "SeqLen2Spatial" plugin node.
+
+        Plugin inputs are: the non-constant input of the bias Add, the bias as a flat float32 constant,
+        input 1 of the residual Add, and a tensor taken from the subgraph that computes the Reshape target shape.
+        Returns the number of plugin nodes inserted.
+        '''
+        nSeqLen2SpatialPlugin = 0
+        for node in self.graph.nodes:
+            if node.op == "Transpose" and node.o().op == "Conv":
+                transposeNode = node
+                reshapeNode = node.i()
+                assert reshapeNode.op == "Reshape", "Unexpected node type for reshapeNode {}".format(reshapeNode.name)
+                residualNode = reshapeNode.i(0)
+                assert residualNode.op == "Add", "Unexpected node type for residualNode {}".format(residualNode.name)
+                biasNode = residualNode.i(0)
+                assert biasNode.op == "Add", "Unexpected node type for biasNode {}".format(biasNode.name)
+                # Find which input of the bias Add is the constant; the other one is the data input.
+                biasIndex = [type(i) == gs.ir.tensor.Constant for i in biasNode.inputs].index(True)
+                bias = np.array(deepcopy(biasNode.inputs[biasIndex].values.tolist()), dtype=np.float32)
+                biasInput = gs.Constant("AddAddSeqLen2SpatialBias-" + str(nSeqLen2SpatialPlugin), np.ascontiguousarray(bias.reshape(-1)))
+                inputIndex = 1 - biasIndex
+                inputTensor = biasNode.inputs[inputIndex]
+                residualInput = residualNode.inputs[1]
+                outputTensor = transposeNode.outputs[0]
+                # Walk up through input 1 of the Reshape (its shape operand) to the tensor the shape was derived from.
+                # NOTE(review): the exact chain of nodes walked here depends on how the model was exported - confirm.
+                outputShapeTensor = transposeNode.i().i().i(1).i(1).i(1).i().inputs[0]
+                seqLen2SpatialNode = gs.Node("SeqLen2Spatial", "AddAddSeqLen2Spatial-" + str(nSeqLen2SpatialPlugin),
+                    inputs=[inputTensor, biasInput, residualInput, outputShapeTensor], outputs=[outputTensor])
+                self.graph.nodes.append(seqLen2SpatialNode)
+                # Disconnect both ends of the old chain so cleanup() removes it.
+                biasNode.inputs.clear()
+                transposeNode.outputs.clear()
+                nSeqLen2SpatialPlugin += 1
+        self.cleanup()
+        return nSeqLen2SpatialPlugin
+    def fuse_kv(self, node_k, node_v, fused_kv_idx, heads, num_dynamic=0):
+        '''
+        Merge the K and V projection MatMuls into one MatMul with interleaved weights.
+
+        node_k, node_v: the K and V MatMul nodes; input 1 of each must be a constant weight.
+        fused_kv_idx: index used to name the new weight constant and MatMul node.
+        heads: number of attention heads.
+        num_dynamic: index of the consumer of each MatMul output that gets rewired to the fused output;
+            consumers in front of it are disconnected.
+        Returns the new fused MatMul node.
+        '''
+        # Get weights of K
+        weights_k = node_k.inputs[1].values
+        # Get weights of V
+        weights_v = node_v.inputs[1].values
+        # Input number of channels to K and V
+        C = weights_k.shape[0]
+        # Number of heads
+        H = heads
+        # Dimension per head
+        D = weights_k.shape[1] // H
+        # Concat and interleave weights such that the output of fused KV GEMM has [b, s_kv, h, 2, d] shape
+        weights_kv = np.dstack([weights_k.reshape(C, H, D), weights_v.reshape(C, H, D)]).reshape(C, 2 * H * D)
+        # K and V have the same input
+        input_tensor = node_k.inputs[0]
+        # K and V must have the same output which we feed into fmha plugin
+        output_tensor_k = node_k.outputs[0]
+        # Create tensor
+        constant_weights_kv = gs.Constant("Weights_KV_{}".format(fused_kv_idx), np.ascontiguousarray(weights_kv))
+        # Create fused KV node
+        fused_kv_node = gs.Node(op="MatMul", name="MatMul_KV_{}".format(fused_kv_idx), inputs=[input_tensor, constant_weights_kv], outputs=[output_tensor_k])
+        self.graph.nodes.append(fused_kv_node)
+        # Connect the output of fused node to the inputs of the nodes after K and V
+        node_v.o(num_dynamic).inputs[0] = output_tensor_k
+        node_k.o(num_dynamic).inputs[0] = output_tensor_k
+        # Disconnect the first consumer num_dynamic times.
+        # NOTE(review): this relies on a cleared consumer dropping out of the tensor's consumer list,
+        # so o() returns the next one on each pass - confirm against graphsurgeon's behavior.
+        for i in range(0,num_dynamic):
+            node_v.o().inputs.clear()
+            node_k.o().inputs.clear()
+        # Clear inputs and outputs of K and V to get these nodes cleared
+        node_k.outputs.clear()
+        node_v.outputs.clear()
+        node_k.inputs.clear()
+        node_v.inputs.clear()
+        self.cleanup()
+        return fused_kv_node
+    def insert_fmhca(self, node_q, node_kv, final_tranpose, mhca_idx, heads, num_dynamic=0):
+        '''
+        Replace a cross-attention subgraph with a Reshape on the fused KV output followed by an "fMHCA" plugin node.
+
+        node_q: the Q MatMul node.
+        node_kv: the fused KV MatMul node (as returned by fuse_kv).
+        final_tranpose: the Transpose that ends the attention subgraph; the plugin takes over its output tensor.
+        mhca_idx: index used to name the new nodes and tensors.
+        heads: number of attention heads.
+        num_dynamic: number of extra consumers of the Q output to step over and disconnect.
+        '''
+        # Get inputs and outputs for the fMHCA plugin
+        # We take an output of reshape that follows the Q GEMM
+        output_q = node_q.o(num_dynamic).o().inputs[0]
+        output_kv = node_kv.o().inputs[0]
+        output_final_tranpose = final_tranpose.outputs[0]
+        # Clear the inputs of the nodes that follow the Q and KV GEMM
+        # to delete these subgraphs (it will be substituted by fMHCA plugin)
+        # NOTE(review): the same statement appears twice on purpose if clearing a consumer's inputs removes it
+        # from the tensor's consumer list, making the second call hit the next consumer - confirm.
+        node_kv.outputs[0].outputs[0].inputs.clear()
+        node_kv.outputs[0].outputs[0].inputs.clear()
+        node_q.o(num_dynamic).o().inputs.clear()
+        for i in range(0,num_dynamic):
+            node_q.o(i).o().o(1).inputs.clear()
+        weights_kv = node_kv.inputs[1].values
+        # The fused weight holds K and V for every head, hence the division by heads * 2.
+        dims_per_head = weights_kv.shape[1] // (heads * 2)
+        # Reshape dims
+        shape = gs.Constant("Shape_KV_{}".format(mhca_idx), np.ascontiguousarray(np.array([0, 0, heads, 2, dims_per_head], dtype=np.int64)))
+        # Reshape output tensor
+        output_reshape = gs.Variable("ReshapeKV_{}".format(mhca_idx), np.dtype(np.float16), None)
+        # Create Reshape node that splits the fused KV output
+        reshape = gs.Node(op="Reshape", name="Reshape_{}".format(mhca_idx), inputs=[output_kv, shape], outputs=[output_reshape])
+        # Insert node
+        self.graph.nodes.append(reshape)
+        # Create fMHCA plugin
+        fmhca = gs.Node(op="fMHCA", name="fMHCA_{}".format(mhca_idx), inputs=[output_q, output_reshape], outputs=[output_final_tranpose])
+        # Insert node
+        self.graph.nodes.append(fmhca)
+        # Connect input of fMHCA to output of Q GEMM
+        node_q.o(num_dynamic).outputs[0] = output_q
+        if num_dynamic > 0:
+            # Feed the node after the final Transpose a shape taken from the Q MatMul input.
+            reshape2_input1_out = gs.Variable("Reshape2_fmhca{}_out".format(mhca_idx), np.dtype(np.int64), None)
+            reshape2_input1_shape = gs.Node("Shape", "Reshape2_fmhca{}_shape".format(mhca_idx), inputs=[node_q.inputs[0]], outputs=[reshape2_input1_out])
+            self.graph.nodes.append(reshape2_input1_shape)
+            final_tranpose.o().inputs[1] = reshape2_input1_out
+        # Clear outputs of transpose to get this subgraph cleared
+        final_tranpose.outputs.clear()
+        self.cleanup()
+    def fuse_qkv(self, node_q, node_k, node_v, fused_qkv_idx, heads, num_dynamic=0):
+        '''
+        Merge the Q, K and V projection MatMuls into one MatMul with interleaved weights.
+
+        node_q, node_k, node_v: the Q, K and V MatMul nodes; input 1 of each must be a constant weight.
+        fused_qkv_idx: index used to name the new weight constant and MatMul node.
+        heads: number of attention heads.
+        num_dynamic: index of the consumer of each MatMul output that gets rewired to the fused output;
+            consumers in front of it are disconnected.
+        Returns the new fused MatMul node.
+        '''
+        # Get weights of Q
+        weights_q = node_q.inputs[1].values
+        # Get weights of K
+        weights_k = node_k.inputs[1].values
+        # Get weights of V
+        weights_v = node_v.inputs[1].values
+        # Input number of channels to Q, K and V
+        C = weights_k.shape[0]
+        # Number of heads
+        H = heads
+        # Hidden dimension per head
+        D = weights_k.shape[1] // H
+        # Concat and interleave weights such that the output of fused QKV GEMM has [b, s, h, 3, d] shape
+        weights_qkv = np.dstack([weights_q.reshape(C, H, D), weights_k.reshape(C, H, D), weights_v.reshape(C, H, D)]).reshape(C, 3 * H * D)
+        input_tensor = node_k.inputs[0]  # K and V have the same input
+        # Q, K and V must have the same output which we feed into fmha plugin
+        output_tensor_k = node_k.outputs[0]
+        # Create the constant tensor holding the fused weights
+        constant_weights_qkv = gs.Constant("Weights_QKV_{}".format(fused_qkv_idx), np.ascontiguousarray(weights_qkv))
+        # Created a fused node
+        fused_qkv_node = gs.Node(op="MatMul", name="MatMul_QKV_{}".format(fused_qkv_idx), inputs=[input_tensor, constant_weights_qkv], outputs=[output_tensor_k])
+        self.graph.nodes.append(fused_qkv_node)
+        # Connect the output of the fused node to the inputs of the nodes after Q, K and V
+        node_q.o(num_dynamic).inputs[0] = output_tensor_k
+        node_k.o(num_dynamic).inputs[0] = output_tensor_k
+        node_v.o(num_dynamic).inputs[0] = output_tensor_k
+        # Disconnect the first consumer num_dynamic times.
+        # NOTE(review): this relies on a cleared consumer dropping out of the tensor's consumer list,
+        # so o() returns the next one on each pass - confirm against graphsurgeon's behavior.
+        for i in range(0,num_dynamic):
+            node_q.o().inputs.clear()
+            node_k.o().inputs.clear()
+            node_v.o().inputs.clear()
+        # Clear inputs and outputs of Q, K and V to get these nodes cleared
+        node_q.outputs.clear()
+        node_k.outputs.clear()
+        node_v.outputs.clear()
+        node_q.inputs.clear()
+        node_k.inputs.clear()
+        node_v.inputs.clear()
+        self.cleanup()
+        return fused_qkv_node
+    def insert_fmha(self, node_qkv, final_tranpose, mha_idx, heads, num_dynamic=0):
+        '''
+        Replace a self-attention subgraph with a Reshape on the fused QKV output followed by an "fMHA_V2" plugin node.
+
+        node_qkv: the fused QKV MatMul node (as returned by fuse_qkv).
+        final_tranpose: the Transpose that ends the attention subgraph; the plugin takes over its output tensor.
+        mha_idx: index used to name the new nodes and tensors.
+        heads: number of attention heads.
+        num_dynamic: when greater than 0, a Shape node on the QKV input is wired into the node after the final Transpose.
+        '''
+        # Get inputs and outputs for the fMHA plugin
+        output_qkv = node_qkv.o().inputs[0]
+        output_final_tranpose = final_tranpose.outputs[0]
+        # Clear the inputs of the nodes that follow the QKV GEMM
+        # to delete these subgraphs (it will be substituted by fMHA plugin)
+        # Consumers are cleared from the highest index down.
+        node_qkv.outputs[0].outputs[2].inputs.clear()
+        node_qkv.outputs[0].outputs[1].inputs.clear()
+        node_qkv.outputs[0].outputs[0].inputs.clear()
+        weights_qkv = node_qkv.inputs[1].values
+        # The fused weight holds Q, K and V for every head, hence the division by heads * 3.
+        dims_per_head = weights_qkv.shape[1] // (heads * 3)
+        # Reshape dims
+        shape = gs.Constant("Shape_QKV_{}".format(mha_idx), np.ascontiguousarray(np.array([0, 0, heads, 3, dims_per_head], dtype=np.int64)))
+        # Reshape output tensor (despite its name, output_shape is the reshaped QKV tensor, not a shape)
+        output_shape = gs.Variable("ReshapeQKV_{}".format(mha_idx), np.dtype(np.float16), None)
+        # Create Reshape node that splits the fused QKV output
+        reshape = gs.Node(op="Reshape", name="Reshape_{}".format(mha_idx), inputs=[output_qkv, shape], outputs=[output_shape])
+        # Insert node
+        self.graph.nodes.append(reshape)
+        # Create fMHA plugin
+        fmha = gs.Node(op="fMHA_V2", name="fMHA_{}".format(mha_idx), inputs=[output_shape], outputs=[output_final_tranpose])
+        # Insert node
+        self.graph.nodes.append(fmha)
+        if num_dynamic > 0:
+            # Feed the node after the final Transpose a shape taken from the fused MatMul input.
+            reshape2_input1_out = gs.Variable("Reshape2_{}_out".format(mha_idx), np.dtype(np.int64), None)
+            reshape2_input1_shape = gs.Node("Shape", "Reshape2_{}_shape".format(mha_idx), inputs=[node_qkv.inputs[0]], outputs=[reshape2_input1_out])
+            self.graph.nodes.append(reshape2_input1_shape)
+            final_tranpose.o().inputs[1] = reshape2_input1_out
+        # Clear outputs of transpose to get this subgraph cleared
+        final_tranpose.outputs.clear()
+        self.cleanup()
+    def mha_mhca_detected(self, node, mha):
+        '''
+        Check whether `node` is the V projection MatMul of an attention subgraph.
+
+        node: candidate graph node.
+        mha: True to look for self-attention (the MatMul input is produced by an Add),
+            False to look for cross-attention (the MatMul input has no producer node).
+        Returns a 7-tuple (detected, num_dynamic_q, num_dynamic_kv, node_q, node_k, node_v, final_tranpose);
+        when nothing is detected it is (False, 0, 0, None, None, None, None).
+        '''
+        # Go from V GEMM down to the S*V MatMul and all way up to K GEMM
+        # If we are looking for MHCA inputs of two matmuls (K and V) must be equal.
+        # If we are looking for MHA inputs (K and V) must be not equal.
+        if node.op == "MatMul" and len(node.outputs) == 1 and \
+            ((mha and len(node.inputs[0].inputs) > 0  and node.i().op == "Add") or \
+            (not mha and len(node.inputs[0].inputs) == 0)):
+            # Count the leading Shape consumers (1 to 3) so they can be stepped over.
+            if node.o().op == 'Shape':
+                if node.o(1).op == 'Shape':
+                    num_dynamic_kv = 3 if node.o(2).op == 'Shape' else 2
+                else:
+                    num_dynamic_kv = 1
+                # For Cross-Attention, if batch axis is dynamic (in QKV), assume H*W (in Q) is dynamic as well
+                num_dynamic_q = num_dynamic_kv if mha else num_dynamic_kv + 1
+            else:
+                num_dynamic_kv = 0
+                num_dynamic_q = 0
+            # First consumer after the Shape consumers.
+            o = node.o(num_dynamic_kv)
+            # o.o().o().o() is the S*V MatMul: input 0 comes from Softmax <- Mul <- Q*K^T MatMul,
+            # whose input 0 leads back to Q and whose input 1 leads back to K.
+            if o.op == "Reshape" and \
+                o.o().op == "Transpose" and \
+                o.o().o().op == "Reshape" and \
+                o.o().o().o().op == "MatMul" and \
+                o.o().o().o().i(0).op == "Softmax" and \
+                o.o().o().o().i(1).op == "Reshape" and \
+                o.o().o().o().i(0).i().op == "Mul" and \
+                o.o().o().o().i(0).i().i().op == "MatMul" and \
+                o.o().o().o().i(0).i().i().i(0).op == "Reshape" and \
+                o.o().o().o().i(0).i().i().i(1).op == "Transpose" and \
+                o.o().o().o().i(0).i().i().i(1).i().op == "Reshape" and \
+                o.o().o().o().i(0).i().i().i(1).i().i().op == "Transpose" and \
+                o.o().o().o().i(0).i().i().i(1).i().i().i().op == "Reshape" and \
+                o.o().o().o().i(0).i().i().i(1).i().i().i().i().op == "MatMul" and \
+                node.name != o.o().o().o().i(0).i().i().i(1).i().i().i().i().name:
+                # "len(node.outputs) == 1" to make sure we are not in the already fused node
+                node_q = o.o().o().o().i(0).i().i().i(0).i().i().i()
+                node_k = o.o().o().o().i(0).i().i().i(1).i().i().i().i()
+                node_v = node
+                final_tranpose = o.o().o().o().o(num_dynamic_q).o()
+                # Sanity check to make sure that the graph looks like expected
+                if node_q.op == "MatMul" and final_tranpose.op == "Transpose":
+                    return True, num_dynamic_q, num_dynamic_kv, node_q, node_k, node_v, final_tranpose
+        return False, 0, 0, None, None, None, None
+    def fuse_kv_insert_fmhca(self, heads, mhca_index, sm):
+        '''
+        Find the first cross-attention subgraph in the graph, fuse its K and V GEMMs and insert the fMHCA plugin.
+
+        heads: number of attention heads.
+        mhca_index: index used to name the nodes created for this attention block.
+        sm: value compared against 75 to decide whether some blocks are skipped.
+        Returns True when one subgraph was rewritten, False when none was found.
+        '''
+        nodes = self.graph.nodes
+        # Iterate over graph and search for MHCA pattern
+        for idx, _ in enumerate(nodes):
+            # fMHCA can't be at the 2 last layers of the network. It is a guard from OOB
+            # NOTE(review): since idx < len(nodes), this condition only holds for the very last node,
+            # not the last two - confirm the intended bound.
+            if idx + 1 > len(nodes) or idx + 2 > len(nodes):
+                continue
+            # Get anchor nodes for fusion and fMHCA plugin insertion if the MHCA is detected
+            detected, num_dynamic_q, num_dynamic_kv, node_q, node_k, node_v, final_tranpose = \
+                self.mha_mhca_detected(nodes[idx], mha=False)
+            if detected:
+                assert num_dynamic_q == 0 or num_dynamic_q == num_dynamic_kv + 1
+                # Skip the FMHCA plugin for SM75 except for when the dim per head is 40.
+                # NOTE(review): the code skips only when the Q weight width divided by heads equals 160,
+                # which does not match the sentence above - confirm which one is intended.
+                if sm == 75 and node_q.inputs[1].shape[1] // heads == 160:
+                    continue
+                # Fuse K and V GEMMS
+                node_kv = self.fuse_kv(node_k, node_v, mhca_index, heads, num_dynamic_kv)
+                # Insert fMHCA plugin
+                self.insert_fmhca(node_q, node_kv, final_tranpose, mhca_index, heads, num_dynamic_q)
+                # Return after the first rewrite: the node list changed, so the caller restarts the scan.
+                return True
+        return False
+    def fuse_qkv_insert_fmha(self, heads, mha_index):
+        '''
+        Find the first self-attention subgraph in the graph, fuse its Q, K and V GEMMs and insert the fMHA plugin.
+
+        heads: number of attention heads.
+        mha_index: index used to name the nodes created for this attention block.
+        Returns True when one subgraph was rewritten, False when none was found.
+        '''
+        nodes = self.graph.nodes
+        # Iterate over graph and search for MHA pattern
+        for idx, _ in enumerate(nodes):
+            # fMHA can't be at the 2 last layers of the network. It is a guard from OOB
+            # NOTE(review): since idx < len(nodes), this condition only holds for the very last node,
+            # not the last two - confirm the intended bound.
+            if idx + 1 > len(nodes) or idx + 2 > len(nodes):
+                continue
+            # Get anchor nodes for fusion and fMHA plugin insertion if the MHA is detected
+            detected, num_dynamic_q, num_dynamic_kv, node_q, node_k, node_v, final_tranpose = \
+                self.mha_mhca_detected(nodes[idx], mha=True)
+            if detected:
+                assert num_dynamic_q == num_dynamic_kv
+                # Fuse Q, K and V GEMMS
+                node_qkv = self.fuse_qkv(node_q, node_k, node_v, mha_index, heads, num_dynamic_kv)
+                # Insert fMHA plugin
+                self.insert_fmha(node_qkv, final_tranpose, mha_index, heads, num_dynamic_kv)
+                # Return after the first rewrite: the node list changed, so the caller restarts the scan.
+                return True
+        return False
+    def insert_fmhca_plugin(self, num_heads, sm):
+        mhca_index = 0
+        while self.fuse_kv_insert_fmhca(num_heads, mhca_index, sm):
+            mhca_index += 1
+        return mhca_index
+    def insert_fmha_plugin(self, num_heads):
+        mha_index = 0
+        while self.fuse_qkv_insert_fmha(num_heads, mha_index):
+            mha_index += 1
+        return mha_index
+class BaseModel():
+    def __init__(
+        self,
+        hf_token,
+        text_maxlen=77,
+        embedding_dim=768,
+        fp16=False,
+        device='cuda',
+        verbose=True,
+        max_batch_size=16
+    ):
+        self.fp16 = fp16
+        self.device = device
+        self.verbose = verbose
+        self.hf_token = hf_token
+        # Defaults
+        self.text_maxlen = text_maxlen
+        self.embedding_dim = embedding_dim
+        self.min_batch = 1
+        self.max_batch = max_batch_size
+        self.min_latent_shape = 256 // 8  # min image resolution: 256x256
+        self.max_latent_shape = 1024 // 8 # max image resolution: 1024x1024
+    def get_model(self):
+        pass
+    def get_input_names(self):
+        pass
+    def get_output_names(self):
+        pass
+    def get_dynamic_axes(self):
+        return None
+    def get_sample_input(self, batch_size, image_height, image_width):
+        pass
+    def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape):
+        return None
+    def get_shape_dict(self, batch_size, image_height, image_width):
+        return None
+    def optimize(self, onnx_graph, minimal_optimization=False):
+        return onnx_graph
+    def check_dims(self, batch_size, image_height, image_width):
+        assert batch_size >= self.min_batch and batch_size <= self.max_batch
+        assert image_height % 8 == 0 or image_width % 8 == 0
+        latent_height = image_height // 8
+        latent_width = image_width // 8
+        assert latent_height >= self.min_latent_shape and latent_height <= self.max_latent_shape
+        assert latent_width >= self.min_latent_shape and latent_width <= self.max_latent_shape
+        return (latent_height, latent_width)
+    def get_minmax_dims(self, batch_size, image_height, image_width, static_batch, static_shape):
+        min_batch = batch_size if static_batch else self.min_batch
+        max_batch = batch_size if static_batch else self.max_batch
+        latent_height = image_height // 8
+        latent_width = image_width // 8
+        min_latent_height = latent_height if static_shape else self.min_latent_shape
+        max_latent_height = latent_height if static_shape else self.max_latent_shape
+        min_latent_width = latent_width if static_shape else self.min_latent_shape
+        max_latent_width = latent_width if static_shape else self.max_latent_shape
+        return (min_batch, max_batch, min_latent_height, max_latent_height, min_latent_width, max_latent_width)
+class CLIP(BaseModel):
+    def get_model(self):
+        return CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14").to(self.device)
+    def get_input_names(self):
+        return ['input_ids']
+    def get_output_names(self):
+       return ['text_embeddings', 'pooler_output']
+    def get_dynamic_axes(self):
+        return {
+            'input_ids': {0: 'B'},
+            'text_embeddings': {0: 'B'}
+        }
+    def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape):
+        self.check_dims(batch_size, image_height, image_width)
+        min_batch, max_batch, _, _, _, _ = self.get_minmax_dims(batch_size, image_height, image_width, static_batch, static_shape)
+        return {
+            'input_ids': [(min_batch, self.text_maxlen), (batch_size, self.text_maxlen), (max_batch, self.text_maxlen)]
+        }
+    def get_shape_dict(self, batch_size, image_height, image_width):
+        self.check_dims(batch_size, image_height, image_width)
+        return {
+            'input_ids': (batch_size, self.text_maxlen),
+            'text_embeddings': (batch_size, self.text_maxlen, self.embedding_dim)
+        }
+    def get_sample_input(self, batch_size, image_height, image_width):
+        self.check_dims(batch_size, image_height, image_width)
+        return torch.zeros(batch_size, self.text_maxlen, dtype=torch.int32, device=self.device)
+    def optimize(self, onnx_graph, minimal_optimization=False):
+        enable_optimization = not minimal_optimization
+        # Remove Cast Node to optimize Attention block
+        bRemoveCastNode = enable_optimization
+        # Insert LayerNormalization Plugin
+        bLayerNormPlugin = enable_optimization
+        opt = Optimizer(onnx_graph, verbose=self.verbose)
+        opt.info('CLIP: original')
+        opt.select_outputs([0]) # delete graph output#1
+        opt.cleanup()
+        opt.info('CLIP: remove output[1]')
+        opt.fold_constants()
+        opt.info('CLIP: fold constants')
+        opt.infer_shapes()
+        opt.info('CLIP: shape inference')
+        if bRemoveCastNode:
+            num_casts_removed = opt.remove_casts()
+            opt.info('CLIP: removed '+str(num_casts_removed)+' casts')
+        if bLayerNormPlugin:
+            num_layernorm_inserted = opt.insert_layernorm_plugin()
+            opt.info('CLIP: inserted '+str(num_layernorm_inserted)+' LayerNorm plugins')
+        opt.select_outputs([0], names=['text_embeddings']) # rename network output
+        opt_onnx_graph = opt.cleanup(return_onnx=True)
+        opt.info('CLIP: final')
+        return opt_onnx_graph
+class UNet(BaseModel):
+    def get_model(self):
+        model_opts = {'revision': 'fp16', 'torch_dtype': torch.float16} if self.fp16 else {}
+        return UNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4",
+            subfolder="unet",
+            use_auth_token=self.hf_token,
+            **model_opts).to(self.device)
+    def get_input_names(self):
+        return ['sample', 'timestep', 'encoder_hidden_states']
+    def get_output_names(self):
+       return ['latent']
+    def get_dynamic_axes(self):
+        return {
+            'sample': {0: '2B', 2: 'H', 3: 'W'},
+            'encoder_hidden_states': {0: '2B'},
+            'latent': {0: '2B', 2: 'H', 3: 'W'}
+        }
+    def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape):
+        latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+        min_batch, max_batch, min_latent_height, max_latent_height, min_latent_width, max_latent_width = \
+            self.get_minmax_dims(batch_size, image_height, image_width, static_batch, static_shape)
+        return {
+            'sample': [(2*min_batch, 4, min_latent_height, min_latent_width), (2*batch_size, 4, latent_height, latent_width), (2*max_batch, 4, max_latent_height, max_latent_width)],
+            'encoder_hidden_states': [(2*min_batch, self.text_maxlen, self.embedding_dim), (2*batch_size, self.text_maxlen, self.embedding_dim), (2*max_batch, self.text_maxlen, self.embedding_dim)]
+        }
+    def get_shape_dict(self, batch_size, image_height, image_width):
+        latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+        return {
+            'sample': (2*batch_size, 4, latent_height, latent_width),
+            'encoder_hidden_states': (2*batch_size, self.text_maxlen, self.embedding_dim),
+            'latent': (2*batch_size, 4, latent_height, latent_width)
+        }
+    def get_sample_input(self, batch_size, image_height, image_width):
+        latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+        dtype = torch.float16 if self.fp16 else torch.float32
+        return (
+            torch.randn(2*batch_size, 4, latent_height, latent_width, dtype=torch.float32, device=self.device),
+            torch.tensor([1.], dtype=torch.float32, device=self.device),
+            torch.randn(2*batch_size, self.text_maxlen, self.embedding_dim, dtype=dtype, device=self.device)
+        )
+    def optimize(self, onnx_graph, minimal_optimization=False):
+        enable_optimization = not minimal_optimization
+        # Decompose InstanceNormalization into primitive Ops
+        bRemoveInstanceNorm = enable_optimization
+        # Remove Cast Node to optimize Attention block
+        bRemoveCastNode = enable_optimization
+        # Remove parallel Swish ops
+        bRemoveParallelSwish = enable_optimization
+        # Adjust the bias to be the second input to the Add ops
+        bAdjustAddNode = enable_optimization
+        # Change Resize node to take size instead of scale
+        bResizeFix = enable_optimization
+        # Common override for disabling all plugins below
+        bDisablePlugins = minimal_optimization
+        # Use multi-head attention Plugin
+        bMHAPlugin = True
+        # Use multi-head cross attention Plugin
+        bMHCAPlugin = True
+        # Insert GroupNormalization Plugin
+        bGroupNormPlugin = True
+        # Insert LayerNormalization Plugin
+        bLayerNormPlugin = True
+        # Insert Split+GeLU Plugin
+        bSplitGeLUPlugin = True
+        # Replace BiasAdd+ResidualAdd+SeqLen2Spatial with plugin
+        bSeqLen2SpatialPlugin = True
+        opt = Optimizer(onnx_graph, verbose=self.verbose)
+        opt.info('UNet: original')
+        if bRemoveInstanceNorm:
+            num_instancenorm_replaced = opt.decompose_instancenorms()
+            opt.info('UNet: replaced '+str(num_instancenorm_replaced)+' InstanceNorms')
+        if bRemoveCastNode:
+            num_casts_removed = opt.remove_casts()
+            opt.info('UNet: removed '+str(num_casts_removed)+' casts')
+        if bRemoveParallelSwish:
+            num_parallel_swish_removed = opt.remove_parallel_swish()
+            opt.info('UNet: removed '+str(num_parallel_swish_removed)+' parallel swish ops')
+        if bAdjustAddNode:
+            num_adjust_add = opt.adjustAddNode()
+            opt.info('UNet: adjusted '+str(num_adjust_add)+' adds')
+        if bResizeFix:
+            num_resize_fix = opt.resize_fix()
+            opt.info('UNet: fixed '+str(num_resize_fix)+' resizes')
+        opt.cleanup()
+        opt.info('UNet: cleanup')
+        opt.fold_constants()
+        opt.info('UNet: fold constants')
+        opt.infer_shapes()
+        opt.info('UNet: shape inference')
+        num_heads = 8
+        if bMHAPlugin and not bDisablePlugins:
+            num_fmha_inserted = opt.insert_fmha_plugin(num_heads)
+            opt.info('UNet: inserted '+str(num_fmha_inserted)+' fMHA plugins')
+        if bMHCAPlugin and not bDisablePlugins:
+            props = cudart.cudaGetDeviceProperties(0)[1]
+            sm = props.major * 10 + props.minor
+            num_fmhca_inserted = opt.insert_fmhca_plugin(num_heads, sm)
+            opt.info('UNet: inserted '+str(num_fmhca_inserted)+' fMHCA plugins')
+        if bGroupNormPlugin and not bDisablePlugins:
+            num_groupnorm_inserted = opt.insert_groupnorm_plugin()
+            opt.info('UNet: inserted '+str(num_groupnorm_inserted)+' GroupNorm plugins')
+        if bLayerNormPlugin and not bDisablePlugins:
+            num_layernorm_inserted = opt.insert_layernorm_plugin()
+            opt.info('UNet: inserted '+str(num_layernorm_inserted)+' LayerNorm plugins')
+        if bSplitGeLUPlugin and not bDisablePlugins:
+            num_splitgelu_inserted = opt.insert_splitgelu_plugin()
+            opt.info('UNet: inserted '+str(num_splitgelu_inserted)+' SplitGeLU plugins')
+        if bSeqLen2SpatialPlugin and not bDisablePlugins:
+            num_seq2spatial_inserted = opt.insert_seq2spatial_plugin()
+            opt.info('UNet: inserted '+str(num_seq2spatial_inserted)+' SeqLen2Spatial plugins')
+        onnx_opt_graph = opt.cleanup(return_onnx=True)
+        opt.info('UNet: final')
+        return onnx_opt_graph
+class VAE(BaseModel):
+    def get_model(self):
+        vae = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4",
+            subfolder="vae",
+            use_auth_token=self.hf_token).to(self.device)
+        vae.forward = vae.decode
+        return vae
+    def get_input_names(self):
+        return ['latent']
+    def get_output_names(self):
+       return ['images']
+    def get_dynamic_axes(self):
+        return {
+            'latent': {0: 'B', 2: 'H', 3: 'W'},
+            'images': {0: 'B', 2: '8H', 3: '8W'}
+        }
+    def get_input_profile(self, batch_size, image_height, image_width, static_batch, static_shape):
+        latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+        min_batch, max_batch, min_latent_height, max_latent_height, min_latent_width, max_latent_width = \
+            self.get_minmax_dims(batch_size, image_height, image_width, static_batch, static_shape)
+        return {
+            'latent': [(min_batch, 4, min_latent_height, min_latent_width), (batch_size, 4, latent_height, latent_width), (max_batch, 4, max_latent_height, max_latent_width)]
+        }
+    def get_shape_dict(self, batch_size, image_height, image_width):
+        latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+        return {
+            'latent': (batch_size, 4, latent_height, latent_width),
+            'images': (batch_size, 3, image_height, image_width)
+        }
+    def get_sample_input(self, batch_size, image_height, image_width):
+        latent_height, latent_width = self.check_dims(batch_size, image_height, image_width)
+        return torch.randn(batch_size, 4, latent_height, latent_width, dtype=torch.float32, device=self.device)
+    def optimize(self, onnx_graph, minimal_optimization=False):
+        enable_optimization = not minimal_optimization
+        # Decompose InstanceNormalization into primitive Ops
+        bRemoveInstanceNorm = enable_optimization
+        # Remove Cast Node to optimize Attention block
+        bRemoveCastNode = enable_optimization
+        # Insert GroupNormalization Plugin
+        bGroupNormPlugin = enable_optimization
+        opt = Optimizer(onnx_graph, verbose=self.verbose)
+        opt.info('VAE: original')
+        if bRemoveInstanceNorm:
+            num_instancenorm_replaced = opt.decompose_instancenorms()
+            opt.info('VAE: replaced '+str(num_instancenorm_replaced)+' InstanceNorms')
+        if bRemoveCastNode:
+            num_casts_removed = opt.remove_casts()
+            opt.info('VAE: removed '+str(num_casts_removed)+' casts')
+        opt.cleanup()
+        opt.info('VAE: cleanup')
+        opt.fold_constants()
+        opt.info('VAE: fold constants')
+        opt.infer_shapes()
+        opt.info('VAE: shape inference')
+        if bGroupNormPlugin:
+            num_groupnorm_inserted = opt.insert_groupnorm_plugin()
+            opt.info('VAE: inserted '+str(num_groupnorm_inserted)+' GroupNorm plugins')
+        onnx_opt_graph = opt.cleanup(return_onnx=True)
+        opt.info('VAE: final')
+        return onnx_opt_graph

onnx/clip.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f07f42f288698f966fb8f35f42cab2f2e2454bcbb68baee9e57280b7686e3ace
+size 322361500

onnx/clip.opt.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:aa2c5bb7df8c93150f9c962d9af8fd992fba3d3302697f1eee7bb442472be3f5
+size 322335606

onnx/unet_fp16.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:01773c0d9f04889be77e7165388a72fce12033d019150cb036f1e5d8b21c91ba
+size 1720130667

onnx/unet_fp16.opt.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b541324a30e9b8f25c777f315aedac2ef3078dc782287e3df8bf074ba822cb21
+size 1719727102

onnx/vae.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:706de829e4ead501ec0357d98e42fdd515abaf10b6a9e985f498d88e5657b573
+size 99088306

onnx/vae.opt.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c0a60a41554cf0fde832dd7494c8fb7b4485e8fecda8656c3678d5d57b97aa2
+size 99061557

requirements.txt ADDED Viewed

	@@ -0,0 +1,15 @@

+colored
+cuda-python
+diffusers==0.7.2
+ftfy
+matplotlib
+nvtx
+onnx==1.12.0
+--extra-index-url https://pypi.ngc.nvidia.com
+onnx-graphsurgeon==0.3.25
+onnxruntime==1.13.1
+polygraphy==0.43.1
+scipy
+--extra-index-url https://download.pytorch.org/whl/cu116
+torch==1.12.0+cu116
+transformers==4.24.0

utilities.py ADDED Viewed

	@@ -0,0 +1,537 @@

+#
+# Copyright 2022 The HuggingFace Inc. team.
+# SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+from collections import OrderedDict
+from copy import copy
+import numpy as np
+import os
+import math
+from PIL import Image
+from polygraphy.backend.common import bytes_from_path
+from polygraphy.backend.trt import CreateConfig, Profile
+from polygraphy.backend.trt import engine_from_bytes, engine_from_network, network_from_onnx_path, save_engine
+from polygraphy.backend.trt import util as trt_util
+from polygraphy import cuda
+import random
+from scipy import integrate
+import tensorrt as trt
+import torch
+TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
+class Engine():
+    def __init__(
+        self,
+        model_name,
+        engine_dir,
+    ):
+        self.engine_path = os.path.join(engine_dir, model_name+'.plan')
+        self.engine = None
+        self.context = None
+        self.buffers = OrderedDict()
+        self.tensors = OrderedDict()
+    def __del__(self):
+        [buf.free() for buf in self.buffers.values() if isinstance(buf, cuda.DeviceArray) ]
+        del self.engine
+        del self.context
+        del self.buffers
+        del self.tensors
+    def build(self, onnx_path, fp16, input_profile=None, enable_preview=False):
+        print(f"Building TensorRT engine for {onnx_path}: {self.engine_path}")
+        p = Profile()
+        if input_profile:
+            for name, dims in input_profile.items():
+                assert len(dims) == 3
+                p.add(name, min=dims[0], opt=dims[1], max=dims[2])
+        preview_features = []
+        if enable_preview:
+            trt_version = [int(i) for i in trt.__version__.split(".")]
+            # FASTER_DYNAMIC_SHAPES_0805 should only be used for TRT 8.5.1 or above.
+            if trt_version[0] > 8 or \
+                (trt_version[0] == 8 and (trt_version[1] > 5 or (trt_version[1] == 5 and trt_version[2] >= 1))):
+                preview_features = [trt.PreviewFeature.FASTER_DYNAMIC_SHAPES_0805]
+        engine = engine_from_network(network_from_onnx_path(onnx_path), config=CreateConfig(fp16=fp16, profiles=[p],
+            preview_features=preview_features))
+        save_engine(engine, path=self.engine_path)
+    def activate(self):
+        print(f"Loading TensorRT engine: {self.engine_path}")
+        self.engine = engine_from_bytes(bytes_from_path(self.engine_path))
+        self.context = self.engine.create_execution_context()
+    def allocate_buffers(self, shape_dict=None, device='cuda'):
+        for idx in range(trt_util.get_bindings_per_profile(self.engine)):
+            binding = self.engine[idx]
+            if shape_dict and binding in shape_dict:
+                shape = shape_dict[binding]
+            else:
+                shape = self.engine.get_binding_shape(binding)
+            dtype = trt_util.np_dtype_from_trt(self.engine.get_binding_dtype(binding))
+            if self.engine.binding_is_input(binding):
+                self.context.set_binding_shape(idx, shape)
+            # Workaround to convert np dtype to torch
+            np_type_tensor = np.empty(shape=[], dtype=dtype)
+            torch_type_tensor = torch.from_numpy(np_type_tensor)
+            tensor = torch.empty(tuple(shape), dtype=torch_type_tensor.dtype).to(device=device)
+            self.tensors[binding] = tensor
+            self.buffers[binding] = cuda.DeviceView(ptr=tensor.data_ptr(), shape=shape, dtype=dtype)
+    def infer(self, feed_dict, stream):
+        start_binding, end_binding = trt_util.get_active_profile_bindings(self.context)
+        # shallow copy of ordered dict
+        device_buffers = copy(self.buffers)
+        for name, buf in feed_dict.items():
+            assert isinstance(buf, cuda.DeviceView)
+            device_buffers[name] = buf
+        bindings = [0] * start_binding + [buf.ptr for buf in device_buffers.values()]
+        noerror = self.context.execute_async_v2(bindings=bindings, stream_handle=stream.ptr)
+        if not noerror:
+            raise ValueError(f"ERROR: inference failed.")
+        return self.tensors
+class LMSDiscreteScheduler():
+    def __init__(
+        self,
+        device = 'cuda',
+        beta_start = 0.00085,
+        beta_end = 0.012,
+        num_train_timesteps = 1000,
+    ):
+        self.num_train_timesteps = num_train_timesteps
+        self.order = 4
+        self.beta_start = beta_start
+        self.beta_end = beta_end
+        betas = (torch.linspace(beta_start**0.5, beta_end**0.5, self.num_train_timesteps, dtype=torch.float32) ** 2)
+        alphas = 1.0 - betas
+        self.alphas_cumprod = torch.cumprod(alphas, dim=0)
+        sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+        sigmas = np.concatenate([sigmas[::-1], [0.0]]).astype(np.float32)
+        self.sigmas = torch.from_numpy(sigmas)
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = self.sigmas.max()
+        self.device = device
+    def set_timesteps(self, steps):
+        self.num_inference_steps = steps
+        timesteps = np.linspace(0, self.num_train_timesteps - 1, steps, dtype=float)[::-1].copy()
+        sigmas = np.array(((1 - self.alphas_cumprod) / self.alphas_cumprod) ** 0.5)
+        sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)
+        sigmas = np.concatenate([sigmas, [0.0]]).astype(np.float32)
+        self.sigmas = torch.from_numpy(sigmas).to(device=self.device)
+        # Move all timesteps to correct device beforehand
+        self.timesteps = torch.from_numpy(timesteps).to(device=self.device).float()
+        self.derivatives = []
+    def scale_model_input(self, sample: torch.FloatTensor, idx, *args, **kwargs) -> torch.FloatTensor:
+        return sample * self.latent_scales[idx]
+    def configure(self):
+        order = self.order
+        self.lms_coeffs = []
+        self.latent_scales = [1./((sigma**2 + 1) ** 0.5) for sigma in self.sigmas]
+        def get_lms_coefficient(order, t, current_order):
+            """
+            Compute a linear multistep coefficient.
+            """
+            def lms_derivative(tau):
+                prod = 1.0
+                for k in range(order):
+                    if current_order == k:
+                        continue
+                    prod *= (tau - self.sigmas[t - k]) / (self.sigmas[t - current_order] - self.sigmas[t - k])
+                return prod
+            integrated_coeff = integrate.quad(lms_derivative, self.sigmas[t], self.sigmas[t + 1], epsrel=1e-4)[0]
+            return integrated_coeff
+        for step_index in range(self.num_inference_steps):
+            order = min(step_index + 1, order)
+            self.lms_coeffs.append([get_lms_coefficient(order, step_index, curr_order) for curr_order in range(order)])
+    def step(self, output, latents, idx, timestep):
+        # compute the previous noisy sample x_t -> x_t-1
+        # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
+        sigma = self.sigmas[idx]
+        pred_original_sample = latents - sigma * output
+        # 2. Convert to an ODE derivative
+        derivative = (latents - pred_original_sample) / sigma
+        self.derivatives.append(derivative)
+        if len(self.derivatives) > self.order:
+            self.derivatives.pop(0)
+        # 3. Compute previous sample based on the derivatives path
+        prev_sample = latents + sum(
+            coeff * derivative for coeff, derivative in zip(self.lms_coeffs[idx], reversed(self.derivatives))
+        )
+        return prev_sample
+class DPMScheduler():
+    def __init__(
+        self,
+        beta_start = 0.00085,
+        beta_end = 0.012,
+        num_train_timesteps = 1000,
+        solver_order = 2,
+        predict_epsilon = True,
+        thresholding = False,
+        dynamic_thresholding_ratio = 0.995,
+        sample_max_value = 1.0,
+        algorithm_type = "dpmsolver++",
+        solver_type = "midpoint",
+        lower_order_final = True,
+        device = 'cuda',
+    ):
+        # this schedule is very specific to the latent diffusion model.
+        self.betas = (
+            torch.linspace(beta_start**0.5, beta_end**0.5, num_train_timesteps, dtype=torch.float32) ** 2
+        )
+        self.device = device
+        self.alphas = 1.0 - self.betas
+        self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
+        # Currently we only support VP-type noise schedule
+        self.alpha_t = torch.sqrt(self.alphas_cumprod)
+        self.sigma_t = torch.sqrt(1 - self.alphas_cumprod)
+        self.lambda_t = torch.log(self.alpha_t) - torch.log(self.sigma_t)
+        # standard deviation of the initial noise distribution
+        self.init_noise_sigma = 1.0
+        self.algorithm_type = algorithm_type
+        self.predict_epsilon = predict_epsilon
+        self.thresholding = thresholding
+        self.dynamic_thresholding_ratio = dynamic_thresholding_ratio
+        self.sample_max_value = sample_max_value
+        self.lower_order_final = lower_order_final
+        # settings for DPM-Solver
+        if algorithm_type not in ["dpmsolver", "dpmsolver++"]:
+            raise NotImplementedError(f"{algorithm_type} does is not implemented for {self.__class__}")
+        if solver_type not in ["midpoint", "heun"]:
+            raise NotImplementedError(f"{solver_type} does is not implemented for {self.__class__}")
+        # setable values
+        self.num_inference_steps = None
+        self.solver_order = solver_order
+        self.num_train_timesteps = num_train_timesteps
+        self.solver_type = solver_type
+        self.first_order_first_coef = []
+        self.first_order_second_coef = []
+        self.second_order_first_coef = []
+        self.second_order_second_coef = []
+        self.second_order_third_coef = []
+        self.third_order_first_coef = []
+        self.third_order_second_coef = []
+        self.third_order_third_coef = []
+        self.third_order_fourth_coef = []
+    def scale_model_input(self, sample: torch.FloatTensor, *args, **kwargs) -> torch.FloatTensor:
+        return sample
+    def configure(self):
+        lower_order_nums = 0
+        for step_index in range(self.num_inference_steps):
+            step_idx = step_index
+            timestep = self.timesteps[step_idx]
+            prev_timestep = 0 if step_idx == len(self.timesteps) - 1 else self.timesteps[step_idx + 1]
+            self.dpm_solver_first_order_coefs_precompute(timestep, prev_timestep)
+            timestep_list = [self.timesteps[step_index - 1], timestep]
+            self.multistep_dpm_solver_second_order_coefs_precompute(timestep_list, prev_timestep)
+            timestep_list = [self.timesteps[step_index - 2], self.timesteps[step_index - 1], timestep]
+            self.multistep_dpm_solver_third_order_coefs_precompute(timestep_list, prev_timestep)
+            if lower_order_nums < self.solver_order:
+                lower_order_nums += 1
+    def dpm_solver_first_order_coefs_precompute(self, timestep, prev_timestep):
+        lambda_t, lambda_s = self.lambda_t[prev_timestep], self.lambda_t[timestep]
+        alpha_t, alpha_s = self.alpha_t[prev_timestep], self.alpha_t[timestep]
+        sigma_t, sigma_s = self.sigma_t[prev_timestep], self.sigma_t[timestep]
+        h = lambda_t - lambda_s
+        if self.algorithm_type == "dpmsolver++":
+            self.first_order_first_coef.append(sigma_t / sigma_s)
+            self.first_order_second_coef.append(alpha_t * (torch.exp(-h) - 1.0))
+        elif self.algorithm_type == "dpmsolver":
+            self.first_order_first_coef.append(alpha_t / alpha_s)
+            self.first_order_second_coef.append(sigma_t * (torch.exp(h) - 1.0))
+    def multistep_dpm_solver_second_order_coefs_precompute(self, timestep_list, prev_timestep):
+        t, s0, s1 = prev_timestep, timestep_list[-1], timestep_list[-2]
+        lambda_t, lambda_s0, lambda_s1 = self.lambda_t[t], self.lambda_t[s0], self.lambda_t[s1]
+        alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0]
+        sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0]
+        h = lambda_t - lambda_s0
+        if self.algorithm_type == "dpmsolver++":
+            # See https://arxiv.org/abs/2211.01095 for detailed derivations
+            if self.solver_type == "midpoint":
+                self.second_order_first_coef.append(sigma_t / sigma_s0)
+                self.second_order_second_coef.append((alpha_t * (torch.exp(-h) - 1.0)))
+                self.second_order_third_coef.append(0.5 * (alpha_t * (torch.exp(-h) - 1.0)))
+            elif self.solver_type == "heun":
+                self.second_order_first_coef.append(sigma_t / sigma_s0)
+                self.second_order_second_coef.append((alpha_t * (torch.exp(-h) - 1.0)))
+                self.second_order_third_coef.append(alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0))
+        elif self.algorithm_type == "dpmsolver":
+            # See https://arxiv.org/abs/2206.00927 for detailed derivations
+            if self.solver_type == "midpoint":
+                self.second_order_first_coef.append(alpha_t / alpha_s0)
+                self.second_order_second_coef.append((sigma_t * (torch.exp(h) - 1.0)))
+                self.second_order_third_coef.append(0.5 * (sigma_t * (torch.exp(h) - 1.0)))
+            elif self.solver_type == "heun":
+                self.second_order_first_coef.append(alpha_t / alpha_s0)
+                self.second_order_second_coef.append((sigma_t * (torch.exp(h) - 1.0)))
+                self.second_order_third_coef.append((sigma_t * ((torch.exp(h) - 1.0) / h - 1.0)))
+    def multistep_dpm_solver_third_order_coefs_precompute(self, timestep_list, prev_timestep):
+        t, s0 = prev_timestep, timestep_list[-1]
+        lambda_t, lambda_s0 = (
+            self.lambda_t[t],
+            self.lambda_t[s0]
+        )
+        alpha_t, alpha_s0 = self.alpha_t[t], self.alpha_t[s0]
+        sigma_t, sigma_s0 = self.sigma_t[t], self.sigma_t[s0]
+        h = lambda_t - lambda_s0
+        if self.algorithm_type == "dpmsolver++":
+            self.third_order_first_coef.append(sigma_t / sigma_s0)
+            self.third_order_second_coef.append(alpha_t * (torch.exp(-h) - 1.0))
+            self.third_order_third_coef.append(alpha_t * ((torch.exp(-h) - 1.0) / h + 1.0))
+            self.third_order_fourth_coef.append(alpha_t * ((torch.exp(-h) - 1.0 + h) / h**2 - 0.5))
+        elif self.algorithm_type == "dpmsolver":
+            self.third_order_first_coef.append(alpha_t / alpha_s0)
+            self.third_order_second_coef.append(sigma_t * (torch.exp(h) - 1.0))
+            self.third_order_third_coef.append(sigma_t * ((torch.exp(h) - 1.0) / h - 1.0))
+            self.third_order_fourth_coef.append(sigma_t * ((torch.exp(h) - 1.0 - h) / h**2 - 0.5))
+    def set_timesteps(self, num_inference_steps):
+        self.num_inference_steps = num_inference_steps
+        timesteps = (
+            np.linspace(0, self.num_train_timesteps - 1, num_inference_steps + 1)
+            .round()[::-1][:-1]
+            .copy()
+            .astype(np.int32)
+        )
+        self.timesteps = torch.from_numpy(timesteps).to(self.device)
+        self.model_outputs = [
+            None,
+        ] * self.solver_order
+        self.lower_order_nums = 0
+    def convert_model_output(
+        self, model_output, timestep, sample
+    ):
+        # DPM-Solver++ needs to solve an integral of the data prediction model.
+        if self.algorithm_type == "dpmsolver++":
+            if self.predict_epsilon:
+                alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
+                x0_pred = (sample - sigma_t * model_output) / alpha_t
+            else:
+                x0_pred = model_output
+            if self.thresholding:
+                # Dynamic thresholding in https://arxiv.org/abs/2205.11487
+                dynamic_max_val = torch.quantile(
+                    torch.abs(x0_pred).reshape((x0_pred.shape[0], -1)), self.dynamic_thresholding_ratio, dim=1
+                )
+                dynamic_max_val = torch.maximum(
+                    dynamic_max_val,
+                    self.sample_max_value * torch.ones_like(dynamic_max_val).to(dynamic_max_val.device),
+                )[(...,) + (None,) * (x0_pred.ndim - 1)]
+                x0_pred = torch.clamp(x0_pred, -dynamic_max_val, dynamic_max_val) / dynamic_max_val
+            return x0_pred
+        # DPM-Solver needs to solve an integral of the noise prediction model.
+        elif self.algorithm_type == "dpmsolver":
+            if self.predict_epsilon:
+                return model_output
+            else:
+                alpha_t, sigma_t = self.alpha_t[timestep], self.sigma_t[timestep]
+                epsilon = (sample - alpha_t * model_output) / sigma_t
+                return epsilon
+    def dpm_solver_first_order_update(
+        self,
+        idx,
+        model_output,
+        sample
+    ):
+        first_coef = self.first_order_first_coef[idx]
+        second_coef = self.first_order_second_coef[idx]
+        if self.algorithm_type == "dpmsolver++":
+            x_t = first_coef * sample - second_coef * model_output
+        elif self.algorithm_type == "dpmsolver":
+            x_t = first_coef * sample - second_coef * model_output
+        return x_t
+    def multistep_dpm_solver_second_order_update(
+        self,
+        idx,
+        model_output_list,
+        timestep_list,
+        prev_timestep,
+        sample
+    ):
+        t, s0, s1 = prev_timestep, timestep_list[-1], timestep_list[-2]
+        m0, m1 = model_output_list[-1], model_output_list[-2]
+        lambda_t, lambda_s0, lambda_s1 = self.lambda_t[t], self.lambda_t[s0], self.lambda_t[s1]
+        h, h_0 = lambda_t - lambda_s0, lambda_s0 - lambda_s1
+        r0 = h_0 / h
+        D0, D1 = m0, (1.0 / r0) * (m0 - m1)
+        first_coef = self.second_order_first_coef[idx]
+        second_coef = self.second_order_second_coef[idx]
+        third_coef = self.second_order_third_coef[idx]
+        if self.algorithm_type == "dpmsolver++":
+            # See https://arxiv.org/abs/2211.01095 for detailed derivations
+            if self.solver_type == "midpoint":
+                x_t = (
+                    first_coef * sample
+                    - second_coef * D0
+                    - third_coef * D1
+                )
+            elif self.solver_type == "heun":
+                x_t = (
+                    first_coef * sample
+                    - second_coef * D0
+                    + third_coef * D1
+                )
+        elif self.algorithm_type == "dpmsolver":
+            # See https://arxiv.org/abs/2206.00927 for detailed derivations
+            if self.solver_type == "midpoint":
+                x_t = (
+                    first_coef * sample
+                    - second_coef * D0
+                    - third_coef * D1
+                )
+            elif self.solver_type == "heun":
+                x_t = (
+                    first_coef * sample
+                    - second_coef * D0
+                    - third_coef * D1
+                )
+        return x_t
+    def multistep_dpm_solver_third_order_update(
+        self,
+        idx,
+        model_output_list,
+        timestep_list,
+        prev_timestep,
+        sample
+    ):
+        t, s0, s1, s2 = prev_timestep, timestep_list[-1], timestep_list[-2], timestep_list[-3]
+        m0, m1, m2 = model_output_list[-1], model_output_list[-2], model_output_list[-3]
+        lambda_t, lambda_s0, lambda_s1, lambda_s2 = (
+            self.lambda_t[t],
+            self.lambda_t[s0],
+            self.lambda_t[s1],
+            self.lambda_t[s2],
+        )
+        h, h_0, h_1 = lambda_t - lambda_s0, lambda_s0 - lambda_s1, lambda_s1 - lambda_s2
+        r0, r1 = h_0 / h, h_1 / h
+        D0 = m0
+        D1_0, D1_1 = (1.0 / r0) * (m0 - m1), (1.0 / r1) * (m1 - m2)
+        D1 = D1_0 + (r0 / (r0 + r1)) * (D1_0 - D1_1)
+        D2 = (1.0 / (r0 + r1)) * (D1_0 - D1_1)
+        first_coef = self.third_order_first_coef[idx]
+        second_coef = self.third_order_second_coef[idx]
+        third_coef = self.third_order_third_coef[idx]
+        fourth_coef = self.third_order_fourth_coef[idx]
+        if self.algorithm_type == "dpmsolver++":
+            # See https://arxiv.org/abs/2206.00927 for detailed derivations
+            x_t = (
+                first_coef * sample
+                - second_coef * D0
+                + third_coef * D1
+                - fourth_coef * D2
+            )
+        elif self.algorithm_type == "dpmsolver":
+            # See https://arxiv.org/abs/2206.00927 for detailed derivations
+            x_t = (
+                first_coef * sample
+                - second_coef * D0
+                - third_coef * D1
+                - fourth_coef * D2
+            )
+        return x_t
+    def step(self, output, latents, step_index, timestep):
+        if self.num_inference_steps is None:
+            raise ValueError(
+                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
+            )
+        prev_timestep = 0 if step_index == len(self.timesteps) - 1 else self.timesteps[step_index + 1]
+        lower_order_final = (
+            (step_index == len(self.timesteps) - 1) and self.lower_order_final and len(self.timesteps) < 15
+        )
+        lower_order_second = (
+            (step_index == len(self.timesteps) - 2) and self.lower_order_final and len(self.timesteps) < 15
+        )
+        output = self.convert_model_output(output, timestep, latents)
+        for i in range(self.solver_order - 1):
+            self.model_outputs[i] = self.model_outputs[i + 1]
+        self.model_outputs[-1] = output
+        if self.solver_order == 1 or self.lower_order_nums < 1 or lower_order_final:
+            prev_sample = self.dpm_solver_first_order_update(step_index, output, latents)
+        elif self.solver_order == 2 or self.lower_order_nums < 2 or lower_order_second:
+            timestep_list = [self.timesteps[step_index - 1], timestep]
+            prev_sample = self.multistep_dpm_solver_second_order_update(
+                step_index, self.model_outputs, timestep_list, prev_timestep, latents
+            )
+        else:
+            timestep_list = [self.timesteps[step_index - 2], self.timesteps[step_index - 1], timestep]
+            prev_sample = self.multistep_dpm_solver_third_order_update(
+                step_index, self.model_outputs, timestep_list, prev_timestep, latents
+            )
+        if self.lower_order_nums < self.solver_order:
+            self.lower_order_nums += 1
+        return prev_sample
+def save_image(images, image_path_dir, image_name_prefix):
+    """
+    Save the generated images to png files.
+    """
+    images = ((images + 1) * 255 / 2).clamp(0, 255).detach().permute(0, 2, 3, 1).round().type(torch.uint8).cpu().numpy()
+    for i in range(images.shape[0]):
+        image_path  = os.path.join(image_path_dir, image_name_prefix+str(i+1)+'-'+str(random.randint(1000,9999))+'.png')
+        print(f"Saving image {i+1} / {images.shape[0]} to: {image_path}")
+        Image.fromarray(images[i]).save(image_path)