camenduru commited on
Commit
989174e
·
1 Parent(s): 9dc3a34

Delete demo-diffusion.py

Browse files
Files changed (1) hide show
  1. demo-diffusion.py +0 -501
demo-diffusion.py DELETED
@@ -1,501 +0,0 @@
1
- #
2
- # SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
4
- #
5
- # Licensed under the Apache License, Version 2.0 (the "License");
6
- # you may not use this file except in compliance with the License.
7
- # You may obtain a copy of the License at
8
- #
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing, software
12
- # distributed under the License is distributed on an "AS IS" BASIS,
13
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
- # See the License for the specific language governing permissions and
15
- # limitations under the License.
16
- #
17
-
18
- import argparse
19
- from cuda import cudart
20
- from models import CLIP, UNet, VAE
21
- import numpy as np
22
- import nvtx
23
- import os
24
- import onnx
25
- from polygraphy import cuda
26
- import time
27
- import torch
28
- from transformers import CLIPTokenizer
29
- import tensorrt as trt
30
- from utilities import Engine, DPMScheduler, LMSDiscreteScheduler, save_image, TRT_LOGGER
31
-
32
def parseArgs(argv=None):
    """Parse command-line options for the Stable Diffusion TensorRT demo.

    Args:
        argv (list[str] | None):
            Argument list to parse. Defaults to ``sys.argv[1:]`` when None,
            which keeps the original zero-argument call working.

    Returns:
        argparse.Namespace: the parsed options.
    """
    parser = argparse.ArgumentParser(description="Options for Stable Diffusion Demo")
    # Stable Diffusion configuration
    parser.add_argument('prompt', nargs='*', help="Text prompt(s) to guide image generation")
    parser.add_argument('--negative-prompt', nargs='*', default=[''], help="The negative prompt(s) to guide the image generation.")
    parser.add_argument('--repeat-prompt', type=int, default=1, choices=[1, 2, 4, 8, 16], help="Number of times to repeat the prompt (batch size multiplier)")
    parser.add_argument('--height', type=int, default=512, help="Height of image to generate (must be multiple of 8)")
    # Fixed copy-paste error: this help text previously said "Height".
    parser.add_argument('--width', type=int, default=512, help="Width of image to generate (must be multiple of 8)")
    parser.add_argument('--num-images', type=int, default=1, help="Number of images to generate per prompt")
    parser.add_argument('--denoising-steps', type=int, default=50, help="Number of denoising steps")
    parser.add_argument('--denoising-prec', type=str, default='fp16', choices=['fp32', 'fp16'], help="Denoiser model precision")
    parser.add_argument('--scheduler', type=str, default="LMSD", choices=["LMSD", "DPM"], help="Scheduler for diffusion process")

    # ONNX export
    parser.add_argument('--onnx-opset', type=int, default=16, choices=range(7, 18), help="Select ONNX opset version to target for exported models")
    parser.add_argument('--onnx-dir', default='onnx', help="Output directory for ONNX export")
    parser.add_argument('--force-onnx-export', action='store_true', help="Force ONNX export of CLIP, UNET, and VAE models")
    parser.add_argument('--force-onnx-optimize', action='store_true', help="Force ONNX optimizations for CLIP, UNET, and VAE models")
    parser.add_argument('--onnx-minimal-optimization', action='store_true', help="Restrict ONNX optimization to const folding and shape inference.")

    # TensorRT engine build
    parser.add_argument('--engine-dir', default='engine', help="Output directory for TensorRT engines")
    parser.add_argument('--force-engine-build', action='store_true', help="Force rebuilding the TensorRT engine")
    parser.add_argument('--build-static-batch', action='store_true', help="Build TensorRT engines with fixed batch size.")
    parser.add_argument('--build-dynamic-shape', action='store_true', help="Build TensorRT engines with dynamic image shapes.")
    parser.add_argument('--build-preview-features', action='store_true', help="Build TensorRT engines with preview features.")

    # TensorRT inference
    parser.add_argument('--num-warmup-runs', type=int, default=5, help="Number of warmup runs before benchmarking performance")
    parser.add_argument('--nvtx-profile', action='store_true', help="Enable NVTX markers for performance profiling")
    parser.add_argument('--seed', type=int, default=None, help="Seed for random generator to get consistent results")

    parser.add_argument('--output-dir', default='output', help="Output directory for logs and image artifacts")
    parser.add_argument('--hf-token', type=str, help="HuggingFace API access token for downloading model checkpoints")
    parser.add_argument('-v', '--verbose', action='store_true', help="Show verbose output")
    return parser.parse_args(argv)
-
69
class DemoDiffusion:
    """
    Application showcasing the acceleration of Stable Diffusion v1.4 pipeline using NVidia TensorRT w/ Plugins.
    """
    def __init__(
        self,
        denoising_steps,
        denoising_fp16=True,
        scheduler="LMSD",
        guidance_scale=7.5,
        device='cuda',
        output_dir='.',
        hf_token=None,
        verbose=False,
        nvtx_profile=False,
        max_batch_size=16
    ):
        """
        Initializes the Diffusion pipeline.

        Args:
            denoising_steps (int):
                The number of denoising steps.
                More denoising steps usually lead to a higher quality image at the expense of slower inference.
            denoising_fp16 (bool):
                Run the denoising loop (UNet) in fp16 precision.
                When enabled image quality will be lower but generally results in higher throughput.
            scheduler (str):
                Diffusion scheduler: "LMSD" or "DPM".
            guidance_scale (float):
                Guidance scale is enabled by setting as > 1.
                Higher guidance scale encourages to generate images that are closely linked to the text prompt, usually at the expense of lower image quality.
            device (str):
                PyTorch device to run inference. Default: 'cuda'
            output_dir (str):
                Output directory for log files and image artifacts
            hf_token (str):
                HuggingFace User Access Token to use for downloading Stable Diffusion model checkpoints.
            verbose (bool):
                Enable verbose logging.
            nvtx_profile (bool):
                Insert NVTX profiling markers.
            max_batch_size (int):
                Max batch size for dynamic batch engines.

        Raises:
            ValueError: if `scheduler` is not "LMSD" or "DPM".
        """
        # Only supports single image per prompt.
        self.num_images = 1

        self.denoising_steps = denoising_steps
        self.denoising_fp16 = denoising_fp16
        # Classifier-free guidance is assumed enabled throughout the pipeline.
        assert guidance_scale > 1.0
        self.guidance_scale = guidance_scale

        self.output_dir = output_dir
        self.hf_token = hf_token
        self.device = device
        self.verbose = verbose
        self.nvtx_profile = nvtx_profile

        # A scheduler to be used in combination with unet to denoise the encoded image latents.
        # This demo uses an adaptation of LMSDiscreteScheduler or DPMScheduler:
        sched_opts = {'num_train_timesteps': 1000, 'beta_start': 0.00085, 'beta_end': 0.012}
        if scheduler == "DPM":
            self.scheduler = DPMScheduler(device=self.device, **sched_opts)
        elif scheduler == "LMSD":
            self.scheduler = LMSDiscreteScheduler(device=self.device, **sched_opts)
        else:
            # Include the offending value in the message (was an f-string with no placeholder).
            raise ValueError(f"Scheduler should be either DPM or LMSD, got {scheduler!r}")

        # Set later by loadModules().
        self.tokenizer = None

        # UNet engine name encodes the precision so fp16/fp32 engines can coexist on disk.
        self.unet_model_key = 'unet_fp16' if denoising_fp16 else 'unet'
        self.models = {
            'clip': CLIP(hf_token=hf_token, device=device, verbose=verbose, max_batch_size=max_batch_size),
            self.unet_model_key: UNet(hf_token=hf_token, fp16=denoising_fp16, device=device, verbose=verbose, max_batch_size=max_batch_size),
            'vae': VAE(hf_token=hf_token, device=device, verbose=verbose, max_batch_size=max_batch_size)
        }

        # model_name -> Engine, populated by loadEngines().
        self.engine = {}
        self.stream = cuda.Stream()

    def teardown(self):
        """Release TensorRT engines and the CUDA stream."""
        for engine in self.engine.values():
            del engine
        self.stream.free()
        del self.stream

    def getModelPath(self, name, onnx_dir, opt=True):
        """Return the ONNX path for `name`: '<onnx_dir>/<name>[.opt].onnx'."""
        return os.path.join(onnx_dir, name + ('.opt' if opt else '') + '.onnx')

    def loadEngines(
        self,
        engine_dir,
        onnx_dir,
        onnx_opset,
        opt_batch_size,
        opt_image_height,
        opt_image_width,
        force_export=False,
        force_optimize=False,
        force_build=False,
        minimal_optimization=False,
        static_batch=False,
        static_shape=True,
        enable_preview=False,
    ):
        """
        Build and load engines for TensorRT accelerated inference.
        Export ONNX models first, if applicable.

        Args:
            engine_dir (str):
                Directory to write the TensorRT engines.
            onnx_dir (str):
                Directory to write the ONNX models.
            onnx_opset (int):
                ONNX opset version to export the models.
            opt_batch_size (int):
                Batch size to optimize for during engine building.
            opt_image_height (int):
                Image height to optimize for during engine building. Must be a multiple of 8.
            opt_image_width (int):
                Image width to optimize for during engine building. Must be a multiple of 8.
            force_export (bool):
                Force re-exporting the ONNX models.
            force_optimize (bool):
                Force re-optimizing the ONNX models.
            force_build (bool):
                Force re-building the TensorRT engine.
            minimal_optimization (bool):
                Apply minimal optimizations during build (no plugins).
            static_batch (bool):
                Build engine only for specified opt_batch_size.
            static_shape (bool):
                Build engine only for specified opt_image_height & opt_image_width. Default = True.
            enable_preview (bool):
                Enable TensorRT preview features.
        """

        # Build engines
        for model_name, obj in self.models.items():
            engine = Engine(model_name, engine_dir)
            if force_build or not os.path.exists(engine.engine_path):
                onnx_path = self.getModelPath(model_name, onnx_dir, opt=False)
                onnx_opt_path = self.getModelPath(model_name, onnx_dir)
                if not os.path.exists(onnx_opt_path):
                    # Export onnx
                    if force_export or not os.path.exists(onnx_path):
                        print(f"Exporting model: {onnx_path}")
                        model = obj.get_model()
                        with torch.inference_mode(), torch.autocast("cuda"):
                            inputs = obj.get_sample_input(opt_batch_size, opt_image_height, opt_image_width)
                            torch.onnx.export(model,
                                inputs,
                                onnx_path,
                                export_params=True,
                                opset_version=onnx_opset,
                                do_constant_folding=True,
                                input_names=obj.get_input_names(),
                                output_names=obj.get_output_names(),
                                dynamic_axes=obj.get_dynamic_axes(),
                            )
                    else:
                        print(f"Found cached model: {onnx_path}")

                    # Optimize onnx
                    if force_optimize or not os.path.exists(onnx_opt_path):
                        print(f"Generating optimizing model: {onnx_opt_path}")
                        onnx_opt_graph = obj.optimize(onnx.load(onnx_path), minimal_optimization=minimal_optimization)
                        onnx.save(onnx_opt_graph, onnx_opt_path)
                    else:
                        print(f"Found cached optimized model: {onnx_opt_path} ")

                # Build engine
                engine.build(onnx_opt_path, fp16=True,
                    input_profile=obj.get_input_profile(opt_batch_size, opt_image_height, opt_image_width,
                        static_batch=static_batch, static_shape=static_shape),
                    enable_preview=enable_preview)
            self.engine[model_name] = engine

        # Separate iteration to activate engines (only keys are needed here).
        for model_name in self.models:
            self.engine[model_name].activate()

    def loadModules(
        self,
    ):
        """Load the CLIP tokenizer and finalize the scheduler for `denoising_steps`."""
        self.tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
        self.scheduler.set_timesteps(self.denoising_steps)
        # Pre-compute latent input scales and linear multistep coefficients
        self.scheduler.configure()

    def runEngine(self, model_name, feed_dict):
        """Run one TensorRT engine on `feed_dict` using the shared CUDA stream."""
        engine = self.engine[model_name]
        return engine.infer(feed_dict, self.stream)

    def infer(
        self,
        prompt,
        negative_prompt,
        image_height,
        image_width,
        warmup=False,
        verbose=False,
    ):
        """
        Run the diffusion pipeline.

        Args:
            prompt (list[str]):
                The text prompt(s) to guide image generation.
            negative_prompt (list[str]):
                The prompt(s) not to guide the image generation; must match `prompt` in length.
            image_height (int):
                Height (in pixels) of the image to be generated. Must be a multiple of 8.
            image_width (int):
                Width (in pixels) of the image to be generated. Must be a multiple of 8.
            warmup (bool):
                Indicate if this is a warmup run (skips the latency report).
            verbose (bool):
                Enable verbose logging.
        """
        # Process inputs
        batch_size = len(prompt)
        assert len(prompt) == len(negative_prompt)

        # Spatial dimensions of latent tensor (VAE downsamples by a factor of 8)
        latent_height = image_height // 8
        latent_width = image_width // 8

        # Create profiling events; cudaEventCreate returns (err, event) — keep the event.
        events = {}
        for stage in ['clip', 'denoise', 'vae']:
            for marker in ['start', 'stop']:
                events[stage + '-' + marker] = cudart.cudaEventCreate()[1]

        # Allocate buffers for TensorRT engine bindings
        for model_name, obj in self.models.items():
            self.engine[model_name].allocate_buffers(shape_dict=obj.get_shape_dict(batch_size, image_height, image_width), device=self.device)

        generator = None
        # NOTE(review): this reads the module-level `args` from the __main__ block, so
        # infer() raises NameError when the class is used as a library — consider
        # passing the seed in explicitly. Behavior kept as-is here.
        if args.seed is not None:
            generator = torch.Generator(device="cuda").manual_seed(args.seed)

        # Run Stable Diffusion pipeline
        with torch.inference_mode(), torch.autocast("cuda"), trt.Runtime(TRT_LOGGER) as runtime:
            # latents need to be generated on the target device
            unet_channels = 4  # unet.in_channels
            latents_shape = (batch_size * self.num_images, unet_channels, latent_height, latent_width)
            latents_dtype = torch.float32  # text_embeddings.dtype
            latents = torch.randn(latents_shape, device=self.device, dtype=latents_dtype, generator=generator)

            # Scale the initial noise by the standard deviation required by the scheduler
            latents = latents * self.scheduler.init_noise_sigma

            torch.cuda.synchronize()
            e2e_tic = time.perf_counter()

            if self.nvtx_profile:
                nvtx_clip = nvtx.start_range(message='clip', color='green')
            cudart.cudaEventRecord(events['clip-start'], 0)
            # Tokenize input
            text_input_ids = self.tokenizer(
                prompt,
                padding="max_length",
                max_length=self.tokenizer.model_max_length,
                return_tensors="pt",
            ).input_ids.type(torch.int32).to(self.device)

            # CLIP text encoder
            text_input_ids_inp = cuda.DeviceView(ptr=text_input_ids.data_ptr(), shape=text_input_ids.shape, dtype=np.int32)
            text_embeddings = self.runEngine('clip', {"input_ids": text_input_ids_inp})['text_embeddings']

            # Duplicate text embeddings for each generation per prompt
            bs_embed, seq_len, _ = text_embeddings.shape
            text_embeddings = text_embeddings.repeat(1, self.num_images, 1)
            text_embeddings = text_embeddings.view(bs_embed * self.num_images, seq_len, -1)

            max_length = text_input_ids.shape[-1]
            uncond_input_ids = self.tokenizer(
                negative_prompt,
                padding="max_length",
                max_length=max_length,
                truncation=True,
                return_tensors="pt",
            ).input_ids.type(torch.int32).to(self.device)
            uncond_input_ids_inp = cuda.DeviceView(ptr=uncond_input_ids.data_ptr(), shape=uncond_input_ids.shape, dtype=np.int32)
            uncond_embeddings = self.runEngine('clip', {"input_ids": uncond_input_ids_inp})['text_embeddings']

            # Duplicate unconditional embeddings for each generation per prompt
            seq_len = uncond_embeddings.shape[1]
            uncond_embeddings = uncond_embeddings.repeat(1, self.num_images, 1)
            uncond_embeddings = uncond_embeddings.view(batch_size * self.num_images, seq_len, -1)

            # Concatenate the unconditional and text embeddings into a single batch to avoid doing two forward passes for classifier free guidance
            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

            if self.denoising_fp16:
                text_embeddings = text_embeddings.to(dtype=torch.float16)

            cudart.cudaEventRecord(events['clip-stop'], 0)
            if self.nvtx_profile:
                nvtx.end_range(nvtx_clip)

            cudart.cudaEventRecord(events['denoise-start'], 0)
            for step_index, timestep in enumerate(self.scheduler.timesteps):
                if self.nvtx_profile:
                    nvtx_latent_scale = nvtx.start_range(message='latent_scale', color='pink')
                # expand the latents if we are doing classifier free guidance
                latent_model_input = torch.cat([latents] * 2)
                # LMSDiscreteScheduler.scale_model_input()
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, step_index)
                if self.nvtx_profile:
                    nvtx.end_range(nvtx_latent_scale)

                # predict the noise residual
                if self.nvtx_profile:
                    nvtx_unet = nvtx.start_range(message='unet', color='blue')
                dtype = np.float16 if self.denoising_fp16 else np.float32
                # TRT binding for the timestep expects float32
                if timestep.dtype != torch.float32:
                    timestep_float = timestep.float()
                else:
                    timestep_float = timestep
                sample_inp = cuda.DeviceView(ptr=latent_model_input.data_ptr(), shape=latent_model_input.shape, dtype=np.float32)
                timestep_inp = cuda.DeviceView(ptr=timestep_float.data_ptr(), shape=timestep_float.shape, dtype=np.float32)
                embeddings_inp = cuda.DeviceView(ptr=text_embeddings.data_ptr(), shape=text_embeddings.shape, dtype=dtype)
                noise_pred = self.runEngine(self.unet_model_key, {"sample": sample_inp, "timestep": timestep_inp, "encoder_hidden_states": embeddings_inp})['latent']
                if self.nvtx_profile:
                    nvtx.end_range(nvtx_unet)

                if self.nvtx_profile:
                    nvtx_latent_step = nvtx.start_range(message='latent_step', color='pink')
                # Perform guidance
                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
                noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)

                latents = self.scheduler.step(noise_pred, latents, step_index, timestep)

                if self.nvtx_profile:
                    nvtx.end_range(nvtx_latent_step)

            # Undo the VAE latent scaling factor before decoding
            latents = 1. / 0.18215 * latents
            cudart.cudaEventRecord(events['denoise-stop'], 0)

            if self.nvtx_profile:
                nvtx_vae = nvtx.start_range(message='vae', color='red')
            cudart.cudaEventRecord(events['vae-start'], 0)
            sample_inp = cuda.DeviceView(ptr=latents.data_ptr(), shape=latents.shape, dtype=np.float32)
            images = self.runEngine('vae', {"latent": sample_inp})['images']
            cudart.cudaEventRecord(events['vae-stop'], 0)
            if self.nvtx_profile:
                nvtx.end_range(nvtx_vae)

            torch.cuda.synchronize()
            e2e_toc = time.perf_counter()
            if not warmup:
                print('|------------|--------------|')
                print('| {:^10} | {:^12} |'.format('Module', 'Latency'))
                print('|------------|--------------|')
                print('| {:^10} | {:>9.2f} ms |'.format('CLIP', cudart.cudaEventElapsedTime(events['clip-start'], events['clip-stop'])[1]))
                print('| {:^10} | {:>9.2f} ms |'.format('UNet x '+str(self.denoising_steps), cudart.cudaEventElapsedTime(events['denoise-start'], events['denoise-stop'])[1]))
                print('| {:^10} | {:>9.2f} ms |'.format('VAE', cudart.cudaEventElapsedTime(events['vae-start'], events['vae-stop'])[1]))
                print('|------------|--------------|')
                print('| {:^10} | {:>9.2f} ms |'.format('Pipeline', (e2e_toc - e2e_tic)*1000.))
                print('|------------|--------------|')

            # Save image
            image_name_prefix = 'sd-'+('fp16' if self.denoising_fp16 else 'fp32')+''.join(set(['-'+prompt[i].replace(' ','_')[:10] for i in range(batch_size)]))+'-'
            save_image(images, self.output_dir, image_name_prefix)
-
437
if __name__ == "__main__":

    print("[I] Initializing StableDiffusion demo with TensorRT Plugins")
    args = parseArgs()

    # Process prompt: positional nargs='*' always yields a list, then expand by the
    # requested batch multiplier.
    if not isinstance(args.prompt, list):
        raise ValueError(f"`prompt` must be of type `str` or `str` list, but is {type(args.prompt)}")
    prompt = args.prompt * args.repeat_prompt

    # Broadcast a single negative prompt across the whole batch.
    if not isinstance(args.negative_prompt, list):
        raise ValueError(f"`--negative-prompt` must be of type `str` or `str` list, but is {type(args.negative_prompt)}")
    if len(args.negative_prompt) == 1:
        negative_prompt = args.negative_prompt * len(prompt)
    else:
        negative_prompt = args.negative_prompt

    # Dynamic-shape engines only support a smaller maximum batch size.
    max_batch_size = 16
    if args.build_dynamic_shape:
        max_batch_size = 4

    if len(prompt) > max_batch_size:
        raise ValueError(f"Batch size {len(prompt)} is larger than allowed {max_batch_size}. If dynamic shape is used, then maximum batch size is 4")

    # Validate image dimensions (latents are 1/8th resolution, so 8 must divide both).
    image_height = args.height
    image_width = args.width
    if image_height % 8 != 0 or image_width % 8 != 0:
        raise ValueError(f"Image height and width have to be divisible by 8 but specified as: {image_height} and {image_width}.")

    # Register TensorRT plugins
    trt.init_libnvinfer_plugins(TRT_LOGGER, '')

    # Initialize demo
    demo = DemoDiffusion(
        denoising_steps=args.denoising_steps,
        denoising_fp16=(args.denoising_prec == 'fp16'),
        output_dir=args.output_dir,
        scheduler=args.scheduler,
        hf_token=args.hf_token,
        verbose=args.verbose,
        nvtx_profile=args.nvtx_profile,
        max_batch_size=max_batch_size)

    # Load TensorRT engines and pytorch modules
    demo.loadEngines(args.engine_dir, args.onnx_dir, args.onnx_opset,
        opt_batch_size=len(prompt), opt_image_height=image_height, opt_image_width=image_width,
        force_export=args.force_onnx_export, force_optimize=args.force_onnx_optimize,
        force_build=args.force_engine_build, minimal_optimization=args.onnx_minimal_optimization,
        static_batch=args.build_static_batch, static_shape=not args.build_dynamic_shape,
        enable_preview=args.build_preview_features)
    demo.loadModules()

    # Warmup runs populate caches and exclude one-time costs from the reported latency.
    print("[I] Warming up ..")
    for _ in range(args.num_warmup_runs):
        images = demo.infer(prompt, negative_prompt, image_height, image_width, warmup=True, verbose=False)

    print("[I] Running StableDiffusion pipeline")
    if args.nvtx_profile:
        cudart.cudaProfilerStart()
    images = demo.infer(prompt, negative_prompt, image_height, image_width, verbose=args.verbose)
    if args.nvtx_profile:
        cudart.cudaProfilerStop()

    demo.teardown()