Delete demo-diffusion.py
Browse files- demo-diffusion.py +0 -501
demo-diffusion.py
DELETED
|
@@ -1,501 +0,0 @@
|
|
| 1 |
-
#
|
| 2 |
-
# SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
| 3 |
-
# SPDX-License-Identifier: Apache-2.0
|
| 4 |
-
#
|
| 5 |
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 6 |
-
# you may not use this file except in compliance with the License.
|
| 7 |
-
# You may obtain a copy of the License at
|
| 8 |
-
#
|
| 9 |
-
# http://www.apache.org/licenses/LICENSE-2.0
|
| 10 |
-
#
|
| 11 |
-
# Unless required by applicable law or agreed to in writing, software
|
| 12 |
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 13 |
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 14 |
-
# See the License for the specific language governing permissions and
|
| 15 |
-
# limitations under the License.
|
| 16 |
-
#
|
| 17 |
-
|
| 18 |
-
import argparse
|
| 19 |
-
from cuda import cudart
|
| 20 |
-
from models import CLIP, UNet, VAE
|
| 21 |
-
import numpy as np
|
| 22 |
-
import nvtx
|
| 23 |
-
import os
|
| 24 |
-
import onnx
|
| 25 |
-
from polygraphy import cuda
|
| 26 |
-
import time
|
| 27 |
-
import torch
|
| 28 |
-
from transformers import CLIPTokenizer
|
| 29 |
-
import tensorrt as trt
|
| 30 |
-
from utilities import Engine, DPMScheduler, LMSDiscreteScheduler, save_image, TRT_LOGGER
|
| 31 |
-
|
| 32 |
-
def parseArgs():
    """Parse and return command-line options for the Stable Diffusion demo.

    Returns:
        argparse.Namespace: parsed arguments covering pipeline configuration,
        ONNX export, TensorRT engine build, and inference/benchmark options.
    """
    parser = argparse.ArgumentParser(description="Options for Stable Diffusion Demo")
    # Stable Diffusion configuration
    parser.add_argument('prompt', nargs='*', help="Text prompt(s) to guide image generation")
    parser.add_argument('--negative-prompt', nargs='*', default=[''], help="The negative prompt(s) to guide the image generation.")
    parser.add_argument('--repeat-prompt', type=int, default=1, choices=[1, 2, 4, 8, 16], help="Number of times to repeat the prompt (batch size multiplier)")
    parser.add_argument('--height', type=int, default=512, help="Height of image to generate (must be multiple of 8)")
    # FIX: help text previously said "Height" for --width (copy-paste bug).
    parser.add_argument('--width', type=int, default=512, help="Width of image to generate (must be multiple of 8)")
    parser.add_argument('--num-images', type=int, default=1, help="Number of images to generate per prompt")
    parser.add_argument('--denoising-steps', type=int, default=50, help="Number of denoising steps")
    parser.add_argument('--denoising-prec', type=str, default='fp16', choices=['fp32', 'fp16'], help="Denoiser model precision")
    parser.add_argument('--scheduler', type=str, default="LMSD", choices=["LMSD", "DPM"], help="Scheduler for diffusion process")

    # ONNX export
    parser.add_argument('--onnx-opset', type=int, default=16, choices=range(7, 18), help="Select ONNX opset version to target for exported models")
    parser.add_argument('--onnx-dir', default='onnx', help="Output directory for ONNX export")
    parser.add_argument('--force-onnx-export', action='store_true', help="Force ONNX export of CLIP, UNET, and VAE models")
    parser.add_argument('--force-onnx-optimize', action='store_true', help="Force ONNX optimizations for CLIP, UNET, and VAE models")
    parser.add_argument('--onnx-minimal-optimization', action='store_true', help="Restrict ONNX optimization to const folding and shape inference.")

    # TensorRT engine build
    parser.add_argument('--engine-dir', default='engine', help="Output directory for TensorRT engines")
    parser.add_argument('--force-engine-build', action='store_true', help="Force rebuilding the TensorRT engine")
    parser.add_argument('--build-static-batch', action='store_true', help="Build TensorRT engines with fixed batch size.")
    parser.add_argument('--build-dynamic-shape', action='store_true', help="Build TensorRT engines with dynamic image shapes.")
    parser.add_argument('--build-preview-features', action='store_true', help="Build TensorRT engines with preview features.")

    # TensorRT inference
    parser.add_argument('--num-warmup-runs', type=int, default=5, help="Number of warmup runs before benchmarking performance")
    parser.add_argument('--nvtx-profile', action='store_true', help="Enable NVTX markers for performance profiling")
    parser.add_argument('--seed', type=int, default=None, help="Seed for random generator to get consistent results")

    parser.add_argument('--output-dir', default='output', help="Output directory for logs and image artifacts")
    parser.add_argument('--hf-token', type=str, help="HuggingFace API access token for downloading model checkpoints")
    parser.add_argument('-v', '--verbose', action='store_true', help="Show verbose output")
    return parser.parse_args()
|
| 68 |
-
|
| 69 |
-
class DemoDiffusion:
    """
    Application showcasing the acceleration of Stable Diffusion v1.4 pipeline using NVidia TensorRT w/ Plugins.
    """
    def __init__(
        self,
        denoising_steps,
        denoising_fp16=True,
        scheduler="LMSD",
        guidance_scale=7.5,
        device='cuda',
        output_dir='.',
        hf_token=None,
        verbose=False,
        nvtx_profile=False,
        max_batch_size=16
    ):
        """
        Initializes the Diffusion pipeline.

        Args:
            denoising_steps (int):
                The number of denoising steps.
                More denoising steps usually lead to a higher quality image at the expense of slower inference.
            denoising_fp16 (bool):
                Run the denoising loop (UNet) in fp16 precision.
                When enabled image quality will be lower but generally results in higher throughput.
            scheduler (str):
                Diffusion scheduler: "LMSD" or "DPM".
            guidance_scale (float):
                Guidance scale is enabled by setting as > 1.
                Higher guidance scale encourages to generate images that are closely linked to the text prompt, usually at the expense of lower image quality.
            device (str):
                PyTorch device to run inference. Default: 'cuda'
            output_dir (str):
                Output directory for log files and image artifacts
            hf_token (str):
                HuggingFace User Access Token to use for downloading Stable Diffusion model checkpoints.
            verbose (bool):
                Enable verbose logging.
            nvtx_profile (bool):
                Insert NVTX profiling markers.
            max_batch_size (int):
                Max batch size for dynamic batch engines.

        Raises:
            ValueError: if `scheduler` is neither "DPM" nor "LMSD".
        """
        # Only supports single image per prompt.
        self.num_images = 1

        self.denoising_steps = denoising_steps
        self.denoising_fp16 = denoising_fp16
        # Classifier-free guidance must be enabled: infer() always batches the
        # unconditional and conditional embeddings together.
        assert guidance_scale > 1.0
        self.guidance_scale = guidance_scale

        self.output_dir = output_dir
        self.hf_token = hf_token
        self.device = device
        self.verbose = verbose
        self.nvtx_profile = nvtx_profile

        # A scheduler to be used in combination with unet to denoise the encoded image latents.
        # This demo uses an adaptation of LMSDiscreteScheduler or DPMScheduler:
        sched_opts = {'num_train_timesteps': 1000, 'beta_start': 0.00085, 'beta_end': 0.012}
        if scheduler == "DPM":
            self.scheduler = DPMScheduler(device=self.device, **sched_opts)
        elif scheduler == "LMSD":
            self.scheduler = LMSDiscreteScheduler(device=self.device, **sched_opts)
        else:
            # FIX: was an f-string with no placeholders (F541).
            raise ValueError("Scheduler should be either DPM or LMSD")

        # Tokenizer is loaded lazily in loadModules().
        self.tokenizer = None

        self.unet_model_key = 'unet_fp16' if denoising_fp16 else 'unet'
        self.models = {
            'clip': CLIP(hf_token=hf_token, device=device, verbose=verbose, max_batch_size=max_batch_size),
            self.unet_model_key: UNet(hf_token=hf_token, fp16=denoising_fp16, device=device, verbose=verbose, max_batch_size=max_batch_size),
            'vae': VAE(hf_token=hf_token, device=device, verbose=verbose, max_batch_size=max_batch_size)
        }

        self.engine = {}
        self.stream = cuda.Stream()

    def teardown(self):
        """Release the TensorRT engines and the CUDA stream."""
        for engine in self.engine.values():
            del engine
        self.stream.free()
        del self.stream

    def getModelPath(self, name, onnx_dir, opt=True):
        """Return the ONNX path for model `name` ('.opt.onnx' when opt=True)."""
        return os.path.join(onnx_dir, name + ('.opt' if opt else '') + '.onnx')

    def loadEngines(
        self,
        engine_dir,
        onnx_dir,
        onnx_opset,
        opt_batch_size,
        opt_image_height,
        opt_image_width,
        force_export=False,
        force_optimize=False,
        force_build=False,
        minimal_optimization=False,
        static_batch=False,
        static_shape=True,
        enable_preview=False,
    ):
        """
        Build and load engines for TensorRT accelerated inference.
        Export ONNX models first, if applicable.

        Args:
            engine_dir (str):
                Directory to write the TensorRT engines.
            onnx_dir (str):
                Directory to write the ONNX models.
            onnx_opset (int):
                ONNX opset version to export the models.
            opt_batch_size (int):
                Batch size to optimize for during engine building.
            opt_image_height (int):
                Image height to optimize for during engine building. Must be a multiple of 8.
            opt_image_width (int):
                Image width to optimize for during engine building. Must be a multiple of 8.
            force_export (bool):
                Force re-exporting the ONNX models.
            force_optimize (bool):
                Force re-optimizing the ONNX models.
            force_build (bool):
                Force re-building the TensorRT engine.
            minimal_optimization (bool):
                Apply minimal optimizations during build (no plugins).
            static_batch (bool):
                Build engine only for specified opt_batch_size.
            static_shape (bool):
                Build engine only for specified opt_image_height & opt_image_width. Default = True.
            enable_preview (bool):
                Enable TensorRT preview features.
        """

        # Build engines
        for model_name, obj in self.models.items():
            engine = Engine(model_name, engine_dir)
            if force_build or not os.path.exists(engine.engine_path):
                onnx_path = self.getModelPath(model_name, onnx_dir, opt=False)
                onnx_opt_path = self.getModelPath(model_name, onnx_dir)
                if not os.path.exists(onnx_opt_path):
                    # Export onnx
                    if force_export or not os.path.exists(onnx_path):
                        print(f"Exporting model: {onnx_path}")
                        model = obj.get_model()
                        with torch.inference_mode(), torch.autocast("cuda"):
                            inputs = obj.get_sample_input(opt_batch_size, opt_image_height, opt_image_width)
                            torch.onnx.export(model,
                                inputs,
                                onnx_path,
                                export_params=True,
                                opset_version=onnx_opset,
                                do_constant_folding=True,
                                input_names=obj.get_input_names(),
                                output_names=obj.get_output_names(),
                                dynamic_axes=obj.get_dynamic_axes(),
                            )
                    else:
                        print(f"Found cached model: {onnx_path}")

                    # Optimize onnx
                    if force_optimize or not os.path.exists(onnx_opt_path):
                        print(f"Generating optimizing model: {onnx_opt_path}")
                        onnx_opt_graph = obj.optimize(onnx.load(onnx_path), minimal_optimization=minimal_optimization)
                        onnx.save(onnx_opt_graph, onnx_opt_path)
                    else:
                        print(f"Found cached optimized model: {onnx_opt_path} ")

                # Build engine (redundant '\' continuations inside parentheses removed)
                engine.build(onnx_opt_path, fp16=True,
                    input_profile=obj.get_input_profile(opt_batch_size, opt_image_height, opt_image_width,
                        static_batch=static_batch, static_shape=static_shape),
                    enable_preview=enable_preview)
            self.engine[model_name] = engine

        # Separate iteration to activate engines
        for model_name, obj in self.models.items():
            self.engine[model_name].activate()

    def loadModules(
        self,
    ):
        """Load the CLIP tokenizer and configure the scheduler for the chosen step count."""
        self.tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
        self.scheduler.set_timesteps(self.denoising_steps)
        # Pre-compute latent input scales and linear multistep coefficients
        self.scheduler.configure()

    def runEngine(self, model_name, feed_dict):
        """Run inference on the named TensorRT engine with the given bindings."""
        engine = self.engine[model_name]
        return engine.infer(feed_dict, self.stream)

    def infer(
        self,
        prompt,
        negative_prompt,
        image_height,
        image_width,
        warmup=False,
        verbose=False,
    ):
        """
        Run the diffusion pipeline.

        Generated images are written to `self.output_dir` via save_image();
        nothing is returned.

        Args:
            prompt (str):
                The text prompt to guide image generation.
            negative_prompt (str):
                The prompt not to guide the image generation.
            image_height (int):
                Height (in pixels) of the image to be generated. Must be a multiple of 8.
            image_width (int):
                Width (in pixels) of the image to be generated. Must be a multiple of 8.
            warmup (bool):
                Indicate if this is a warmup run (suppresses latency report).
            verbose (bool):
                Enable verbose logging.
        """
        # Process inputs
        batch_size = len(prompt)
        assert len(prompt) == len(negative_prompt)

        # Spatial dimensions of latent tensor
        latent_height = image_height // 8
        latent_width = image_width // 8

        # Create profiling events
        events = {}
        for stage in ['clip', 'denoise', 'vae']:
            for marker in ['start', 'stop']:
                events[stage + '-' + marker] = cudart.cudaEventCreate()[1]

        # Allocate buffers for TensorRT engine bindings
        for model_name, obj in self.models.items():
            self.engine[model_name].allocate_buffers(shape_dict=obj.get_shape_dict(batch_size, image_height, image_width), device=self.device)

        generator = None
        # NOTE(review): reads the module-level `args` global rather than a
        # parameter or attribute -- works when run as a script, but raises
        # NameError if this class is imported elsewhere. Consider passing the
        # seed into __init__ or infer() explicitly.
        if args.seed is not None:
            generator = torch.Generator(device="cuda").manual_seed(args.seed)

        # Run Stable Diffusion pipeline
        with torch.inference_mode(), torch.autocast("cuda"), trt.Runtime(TRT_LOGGER) as runtime:
            # latents need to be generated on the target device
            unet_channels = 4  # unet.in_channels
            latents_shape = (batch_size * self.num_images, unet_channels, latent_height, latent_width)
            latents_dtype = torch.float32  # text_embeddings.dtype
            latents = torch.randn(latents_shape, device=self.device, dtype=latents_dtype, generator=generator)

            # Scale the initial noise by the standard deviation required by the scheduler
            latents = latents * self.scheduler.init_noise_sigma

            torch.cuda.synchronize()
            e2e_tic = time.perf_counter()

            if self.nvtx_profile:
                nvtx_clip = nvtx.start_range(message='clip', color='green')
            cudart.cudaEventRecord(events['clip-start'], 0)
            # Tokenize input
            text_input_ids = self.tokenizer(
                prompt,
                padding="max_length",
                max_length=self.tokenizer.model_max_length,
                return_tensors="pt",
            ).input_ids.type(torch.int32).to(self.device)

            # CLIP text encoder
            text_input_ids_inp = cuda.DeviceView(ptr=text_input_ids.data_ptr(), shape=text_input_ids.shape, dtype=np.int32)
            text_embeddings = self.runEngine('clip', {"input_ids": text_input_ids_inp})['text_embeddings']

            # Duplicate text embeddings for each generation per prompt
            bs_embed, seq_len, _ = text_embeddings.shape
            text_embeddings = text_embeddings.repeat(1, self.num_images, 1)
            text_embeddings = text_embeddings.view(bs_embed * self.num_images, seq_len, -1)

            max_length = text_input_ids.shape[-1]
            uncond_input_ids = self.tokenizer(
                negative_prompt,
                padding="max_length",
                max_length=max_length,
                truncation=True,
                return_tensors="pt",
            ).input_ids.type(torch.int32).to(self.device)
            uncond_input_ids_inp = cuda.DeviceView(ptr=uncond_input_ids.data_ptr(), shape=uncond_input_ids.shape, dtype=np.int32)
            uncond_embeddings = self.runEngine('clip', {"input_ids": uncond_input_ids_inp})['text_embeddings']

            # Duplicate unconditional embeddings for each generation per prompt
            seq_len = uncond_embeddings.shape[1]
            uncond_embeddings = uncond_embeddings.repeat(1, self.num_images, 1)
            uncond_embeddings = uncond_embeddings.view(batch_size * self.num_images, seq_len, -1)

            # Concatenate the unconditional and text embeddings into a single batch to avoid doing two forward passes for classifier free guidance
            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

            if self.denoising_fp16:
                text_embeddings = text_embeddings.to(dtype=torch.float16)

            cudart.cudaEventRecord(events['clip-stop'], 0)
            if self.nvtx_profile:
                nvtx.end_range(nvtx_clip)

            cudart.cudaEventRecord(events['denoise-start'], 0)
            for step_index, timestep in enumerate(self.scheduler.timesteps):
                if self.nvtx_profile:
                    nvtx_latent_scale = nvtx.start_range(message='latent_scale', color='pink')
                # expand the latents if we are doing classifier free guidance
                latent_model_input = torch.cat([latents] * 2)
                # LMSDiscreteScheduler.scale_model_input()
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, step_index)
                if self.nvtx_profile:
                    nvtx.end_range(nvtx_latent_scale)

                # predict the noise residual
                if self.nvtx_profile:
                    nvtx_unet = nvtx.start_range(message='unet', color='blue')
                dtype = np.float16 if self.denoising_fp16 else np.float32
                # UNet engine expects a float32 timestep binding.
                if timestep.dtype != torch.float32:
                    timestep_float = timestep.float()
                else:
                    timestep_float = timestep
                sample_inp = cuda.DeviceView(ptr=latent_model_input.data_ptr(), shape=latent_model_input.shape, dtype=np.float32)
                timestep_inp = cuda.DeviceView(ptr=timestep_float.data_ptr(), shape=timestep_float.shape, dtype=np.float32)
                embeddings_inp = cuda.DeviceView(ptr=text_embeddings.data_ptr(), shape=text_embeddings.shape, dtype=dtype)
                noise_pred = self.runEngine(self.unet_model_key, {"sample": sample_inp, "timestep": timestep_inp, "encoder_hidden_states": embeddings_inp})['latent']
                if self.nvtx_profile:
                    nvtx.end_range(nvtx_unet)

                if self.nvtx_profile:
                    nvtx_latent_step = nvtx.start_range(message='latent_step', color='pink')
                # Perform guidance
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)

                latents = self.scheduler.step(noise_pred, latents, step_index, timestep)

                if self.nvtx_profile:
                    nvtx.end_range(nvtx_latent_step)

            # Undo the latent scaling before decoding (1/0.18215 -- presumably
            # the SD VAE scaling factor; verify against the VAE config).
            latents = 1. / 0.18215 * latents
            cudart.cudaEventRecord(events['denoise-stop'], 0)

            if self.nvtx_profile:
                nvtx_vae = nvtx.start_range(message='vae', color='red')
            cudart.cudaEventRecord(events['vae-start'], 0)
            sample_inp = cuda.DeviceView(ptr=latents.data_ptr(), shape=latents.shape, dtype=np.float32)
            images = self.runEngine('vae', {"latent": sample_inp})['images']
            cudart.cudaEventRecord(events['vae-stop'], 0)
            if self.nvtx_profile:
                nvtx.end_range(nvtx_vae)

            torch.cuda.synchronize()
            e2e_toc = time.perf_counter()
            if not warmup:
                # Per-stage latency report (CUDA event timings in ms).
                print('|------------|--------------|')
                print('| {:^10} | {:^12} |'.format('Module', 'Latency'))
                print('|------------|--------------|')
                print('| {:^10} | {:>9.2f} ms |'.format('CLIP', cudart.cudaEventElapsedTime(events['clip-start'], events['clip-stop'])[1]))
                print('| {:^10} | {:>9.2f} ms |'.format('UNet x '+str(self.denoising_steps), cudart.cudaEventElapsedTime(events['denoise-start'], events['denoise-stop'])[1]))
                print('| {:^10} | {:>9.2f} ms |'.format('VAE', cudart.cudaEventElapsedTime(events['vae-start'], events['vae-stop'])[1]))
                print('|------------|--------------|')
                print('| {:^10} | {:>9.2f} ms |'.format('Pipeline', (e2e_toc - e2e_tic)*1000.))
                print('|------------|--------------|')

                # Save image
                image_name_prefix = 'sd-'+('fp16' if self.denoising_fp16 else 'fp32')+''.join(set(['-'+prompt[i].replace(' ','_')[:10] for i in range(batch_size)]))+'-'
                save_image(images, self.output_dir, image_name_prefix)
|
| 436 |
-
|
| 437 |
-
if __name__ == "__main__":

    print("[I] Initializing StableDiffusion demo with TensorRT Plugins")
    args = parseArgs()

    # Process prompt
    if not isinstance(args.prompt, list):
        raise ValueError(f"`prompt` must be of type `str` or `str` list, but is {type(args.prompt)}")
    prompt = args.prompt * args.repeat_prompt

    if not isinstance(args.negative_prompt, list):
        raise ValueError(f"`--negative-prompt` must be of type `str` or `str` list, but is {type(args.negative_prompt)}")
    # A single negative prompt is broadcast across the whole batch.
    if len(args.negative_prompt) == 1:
        negative_prompt = args.negative_prompt * len(prompt)
    else:
        negative_prompt = args.negative_prompt

    # Dynamic-shape engines are restricted to a smaller max batch size.
    max_batch_size = 16
    if args.build_dynamic_shape:
        max_batch_size = 4

    if len(prompt) > max_batch_size:
        raise ValueError(f"Batch size {len(prompt)} is larger than allowed {max_batch_size}. If dynamic shape is used, then maximum batch size is 4")

    # Validate image dimensions
    image_height = args.height
    image_width = args.width
    if image_height % 8 != 0 or image_width % 8 != 0:
        raise ValueError(f"Image height and width have to be divisible by 8 but specified as: {image_height} and {image_width}.")

    # Register TensorRT plugins
    trt.init_libnvinfer_plugins(TRT_LOGGER, '')

    # Initialize demo
    demo = DemoDiffusion(
        denoising_steps=args.denoising_steps,
        denoising_fp16=(args.denoising_prec == 'fp16'),
        output_dir=args.output_dir,
        scheduler=args.scheduler,
        hf_token=args.hf_token,
        verbose=args.verbose,
        nvtx_profile=args.nvtx_profile,
        max_batch_size=max_batch_size)

    # Load TensorRT engines and pytorch modules
    # (redundant '\' continuations inside parentheses removed)
    demo.loadEngines(args.engine_dir, args.onnx_dir, args.onnx_opset,
        opt_batch_size=len(prompt), opt_image_height=image_height, opt_image_width=image_width,
        force_export=args.force_onnx_export, force_optimize=args.force_onnx_optimize,
        force_build=args.force_engine_build, minimal_optimization=args.onnx_minimal_optimization,
        static_batch=args.build_static_batch, static_shape=not args.build_dynamic_shape,
        enable_preview=args.build_preview_features)
    demo.loadModules()

    # FIX: dropped the dead `images = ...` assignments -- infer() has no
    # return statement (it saves images to the output directory itself).
    print("[I] Warming up ..")
    for _ in range(args.num_warmup_runs):
        demo.infer(prompt, negative_prompt, image_height, image_width, warmup=True, verbose=False)

    print("[I] Running StableDiffusion pipeline")
    if args.nvtx_profile:
        cudart.cudaProfilerStart()
    demo.infer(prompt, negative_prompt, image_height, image_width, verbose=args.verbose)
    if args.nvtx_profile:
        cudart.cudaProfilerStop()

    demo.teardown()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|