# ComfyUI-Upscaler-Tensorrt / trt_utilities.py
# Mirror from https://github.com/yuvraj108c/ComfyUI-Upscaler-Tensorrt
# (revision afaf90f, verified; mirrored by aliensmn)
#
# Copyright 2022 The HuggingFace Inc. team.
# SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import torch
from torch.cuda import nvtx
from collections import OrderedDict
import numpy as np
from polygraphy.backend.common import bytes_from_path
from polygraphy import util
from polygraphy.backend.trt import ModifyNetworkOutputs, Profile
from polygraphy.backend.trt import (
engine_from_bytes,
engine_from_network,
network_from_onnx_path,
save_engine,
)
from polygraphy.logger import G_LOGGER
import tensorrt as trt
from logging import error, warning
from tqdm import tqdm
import copy
TRT_LOGGER = trt.Logger(trt.Logger.ERROR)
G_LOGGER.module_severity = G_LOGGER.ERROR
# Map of numpy dtype -> torch dtype
numpy_to_torch_dtype_dict = {
np.uint8: torch.uint8,
np.int8: torch.int8,
np.int16: torch.int16,
np.int32: torch.int32,
np.int64: torch.int64,
np.float16: torch.float16,
np.float32: torch.float32,
np.float64: torch.float64,
np.complex64: torch.complex64,
np.complex128: torch.complex128,
}
if np.version.full_version >= "1.24.0":
numpy_to_torch_dtype_dict[np.bool_] = torch.bool
else:
numpy_to_torch_dtype_dict[np.bool] = torch.bool
# Map of torch dtype -> numpy dtype
torch_to_numpy_dtype_dict = {
value: key for (key, value) in numpy_to_torch_dtype_dict.items()
}
class TQDMProgressMonitor(trt.IProgressMonitor):
def __init__(self):
trt.IProgressMonitor.__init__(self)
self._active_phases = {}
self._step_result = True
self.max_indent = 5
def phase_start(self, phase_name, parent_phase, num_steps):
leave = False
try:
if parent_phase is not None:
nbIndents = (
self._active_phases.get(parent_phase, {}).get(
"nbIndents", self.max_indent
)
+ 1
)
if nbIndents >= self.max_indent:
return
else:
nbIndents = 0
leave = True
self._active_phases[phase_name] = {
"tq": tqdm(
total=num_steps, desc=phase_name, leave=leave, position=nbIndents
),
"nbIndents": nbIndents,
"parent_phase": parent_phase,
}
except KeyboardInterrupt:
# The phase_start callback cannot directly cancel the build, so request the cancellation from within step_complete.
_step_result = False
def phase_finish(self, phase_name):
try:
if phase_name in self._active_phases.keys():
self._active_phases[phase_name]["tq"].update(
self._active_phases[phase_name]["tq"].total
- self._active_phases[phase_name]["tq"].n
)
parent_phase = self._active_phases[phase_name].get("parent_phase", None)
while parent_phase is not None:
self._active_phases[parent_phase]["tq"].refresh()
parent_phase = self._active_phases[parent_phase].get(
"parent_phase", None
)
if (
self._active_phases[phase_name]["parent_phase"]
in self._active_phases.keys()
):
self._active_phases[
self._active_phases[phase_name]["parent_phase"]
]["tq"].refresh()
del self._active_phases[phase_name]
pass
except KeyboardInterrupt:
_step_result = False
def step_complete(self, phase_name, step):
try:
if phase_name in self._active_phases.keys():
self._active_phases[phase_name]["tq"].update(
step - self._active_phases[phase_name]["tq"].n
)
return self._step_result
except KeyboardInterrupt:
# There is no need to propagate this exception to TensorRT. We can simply cancel the build.
return False
class Engine:
def __init__(
self,
engine_path,
):
self.engine_path = engine_path
self.engine = None
self.context = None
self.buffers = OrderedDict()
self.tensors = OrderedDict()
self.cuda_graph_instance = None # cuda graph
def __del__(self):
del self.engine
del self.context
del self.buffers
del self.tensors
def reset(self, engine_path=None):
# del self.engine
del self.context
del self.buffers
del self.tensors
# self.engine_path = engine_path
self.context = None
self.buffers = OrderedDict()
self.tensors = OrderedDict()
self.inputs = {}
self.outputs = {}
def build(
self,
onnx_path,
fp16,
input_profile=None,
enable_refit=False,
enable_preview=False,
enable_all_tactics=False,
timing_cache=None,
update_output_names=None,
):
p = [Profile()]
if input_profile:
p = [Profile() for i in range(len(input_profile))]
for _p, i_profile in zip(p, input_profile):
for name, dims in i_profile.items():
assert len(dims) == 3
_p.add(name, min=dims[0], opt=dims[1], max=dims[2])
config_kwargs = {}
if not enable_all_tactics:
config_kwargs["tactic_sources"] = []
network = network_from_onnx_path(
onnx_path, flags=[trt.OnnxParserFlag.NATIVE_INSTANCENORM]
)
if update_output_names:
print(f"Updating network outputs to {update_output_names}")
network = ModifyNetworkOutputs(network, update_output_names)
builder = network[0]
config = builder.create_builder_config()
config.progress_monitor = TQDMProgressMonitor()
config.set_flag(trt.BuilderFlag.FP16) if fp16 else None
config.set_flag(trt.BuilderFlag.REFIT) if enable_refit else None
profiles = copy.deepcopy(p)
for profile in profiles:
# Last profile is used for set_calibration_profile.
calib_profile = profile.fill_defaults(network[1]).to_trt(
builder, network[1]
)
config.add_optimization_profile(calib_profile)
try:
engine = engine_from_network(
network,
config,
)
except Exception as e:
error(f"Failed to build engine: {e}")
return 1
try:
save_engine(engine, path=self.engine_path)
except Exception as e:
error(f"Failed to save engine: {e}")
return 1
return 0
def load(self):
self.engine = engine_from_bytes(bytes_from_path(self.engine_path))
def activate(self, reuse_device_memory=None):
if reuse_device_memory:
self.context = self.engine.create_execution_context_without_device_memory()
# self.context.device_memory = reuse_device_memory
else:
self.context = self.engine.create_execution_context()
def allocate_buffers(self, shape_dict=None, device="cuda"):
nvtx.range_push("allocate_buffers")
for idx in range(self.engine.num_io_tensors):
name = self.engine.get_tensor_name(idx)
binding = self.engine[idx]
if shape_dict and binding in shape_dict:
shape = shape_dict[binding]["shape"]
else:
shape = self.context.get_tensor_shape(name)
dtype = trt.nptype(self.engine.get_tensor_dtype(name))
if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
self.context.set_input_shape(name, shape)
tensor = torch.empty(
tuple(shape), dtype=numpy_to_torch_dtype_dict[dtype]
).to(device=device)
self.tensors[binding] = tensor
nvtx.range_pop()
def infer(self, feed_dict, stream, use_cuda_graph=False):
nvtx.range_push("set_tensors")
for name, buf in feed_dict.items():
self.tensors[name].copy_(buf)
for name, tensor in self.tensors.items():
self.context.set_tensor_address(name, tensor.data_ptr())
nvtx.range_pop()
nvtx.range_push("execute")
noerror = self.context.execute_async_v3(stream)
if not noerror:
raise ValueError("ERROR: inference failed.")
nvtx.range_pop()
return self.tensors
def __str__(self):
out = ""
# When raising errors in the upscaler, this str() called by comfy's execution.py,
# but the engine won't have the attributes required for stringification
# If str() also raises an error, comfy gets soft-locked, not running prompts until restarted
if not hasattr(self.engine, "num_optimization_profiles") or not hasattr(self.engine, "num_bindings"):
return out
for opt_profile in range(self.engine.num_optimization_profiles):
for binding_idx in range(self.engine.num_bindings):
name = self.engine.get_binding_name(binding_idx)
shape = self.engine.get_profile_shape(opt_profile, name)
out += f"\t{name} = {shape}\n"
return out