saliacoel
/

MyCustomNodes

ONNX

Safetensors

depth_anything

Model card Files Files and versions

xet

Community

saliacoel committed on Dec 10, 2025

Commit

c6054f8

verified ·

1 Parent(s): 1306c71

Upload tensorrt_loader.py

Browse files

Files changed (1) hide show

tensorrt_loader.py +77 -96

tensorrt_loader.py CHANGED Viewed

@@ -1,5 +1,3 @@
-#Put this in the custom_nodes folder, put your tensorrt engine files in ComfyUI/models/tensorrt/ (you will have to create the directory)
 import torch
 import os
@@ -24,59 +22,59 @@ trt.init_libnvinfer_plugins(None, "")
 logger = trt.Logger(trt.Logger.INFO)
 runtime = trt.Runtime(logger)
-# Is there a function that already exists for this?
 def trt_datatype_to_torch(datatype):
-    if datatype == trt.float16:
         return torch.float16
-    elif datatype == trt.float32:
         return torch.float32
-    elif datatype == trt.int32:
-        return torch.int32
-    elif datatype == trt.bfloat16:
         return torch.bfloat16
 class TrTUnet:
     def __init__(self, engine_path):
         with open(engine_path, "rb") as f:
-            self.engine = runtime.deserialize_cuda_engine(f.read())
         self.context = self.engine.create_execution_context()
-        # default dtype in case something doesn't have a specific TRT dtype
         self.dtype = torch.float16
-    def set_bindings_shape(self, inputs, split_batch):
-        # still here in case something else calls it, but the new __call__
-        # no longer uses this split-batch path
-        for k in inputs:
-            shape = inputs[k].shape
-            shape = [shape[0] // split_batch] + list(shape[1:])
-            self.context.set_input_shape(k, shape)
     def __call__(self, x, timesteps, context, y=None,
                  control=None, transformer_options=None, **kwargs):
         """
-        Run the TensorRT UNet.
-        - `control` and `transformer_options` are accepted for API compatibility
-          with Comfy, but ignored by the TRT engine.
-        - Any extra tensor inputs (e.g. `guidance` for Flux) are taken from
-          **kwargs and matched by name to the engine’s input tensors.
         """
-        # Collect all tensors we might need by name
-        available = {
-            "x": x,
-            "timesteps": timesteps,
-            "context": context,
-        }
-        if y is not None:
-            available["y"] = y
-        # Extra conds (e.g. 'guidance', etc.) may come in via kwargs
-        for name, value in kwargs.items():
-            if isinstance(value, torch.Tensor):
-                available[name] = value
-        # Query engine IO tensors
         tensor_names = [
             self.engine.get_tensor_name(i)
             for i in range(self.engine.num_io_tensors)
@@ -90,87 +88,70 @@ class TrTUnet:
             if self.engine.get_tensor_mode(n) == trt.TensorIOMode.OUTPUT
         ]
-        # Sanity check: we must have a tensor for every input
-        missing = [n for n in input_names if n not in available]
-        if missing:
-            raise RuntimeError(
-                f"Missing tensors for TensorRT engine inputs: {missing}. "
-                f"Available: {list(available.keys())}"
-            )
-        device = x.device
-        # Bind inputs: fix dtype + device, set shapes and addresses
-        for name in input_names:
-            t = available[name]
-            if not t.is_contiguous():
-                t = t.contiguous()
-            # Match engine dtype
-            trt_dtype = self.engine.get_tensor_dtype(name)
-            torch_dtype = trt_datatype_to_torch(trt_dtype)
-            if torch_dtype is None:
                 raise RuntimeError(
-                    f"Unsupported TensorRT dtype {trt_dtype} for input '{name}'"
                 )
-            if t.dtype != torch_dtype:
-                t = t.to(dtype=torch_dtype)
-            if t.device != device:
-                t = t.to(device)
-            # Save back in case we changed it
-            available[name] = t
-            # Tell TRT the runtime shape and bind the memory
             self.context.set_input_shape(name, tuple(t.shape))
             self.context.set_tensor_address(name, t.data_ptr())
-        # Let TRT resolve all dynamic shapes (outputs etc.)
-        unresolved = self.context.infer_shapes()
-        if unresolved:
             raise RuntimeError(
-                f"TensorRT shape inference failed, unresolved tensors: {unresolved}"
             )
         # Allocate and bind outputs
         outputs = []
         for name in output_names:
-            dims = self.context.get_tensor_shape(name)  # trt.Dims
-            # Guard against the old nbDims == -1 issue
-            if hasattr(dims, "nb_dims") and dims.nb_dims < 0:
-                raise RuntimeError(f"Output '{name}' has invalid dims: {dims}")
-            shape = [int(d) for d in dims]
-            trt_dtype = self.engine.get_tensor_dtype(name)
-            torch_dtype = trt_datatype_to_torch(trt_dtype)
-            out = torch.empty(shape, device=device, dtype=torch_dtype)
-            self.context.set_tensor_address(name, out.data_ptr())
-            outputs.append(out)
-        # Run on the default torch CUDA stream
-        stream = torch.cuda.default_stream(device)
-        self.context.execute_async_v3(stream_handle=stream.cuda_stream)
-        # Return single tensor or a tuple
-        if len(outputs) == 1:
-            return outputs[0]
-        return tuple(outputs)
     def load_state_dict(self, sd, strict=False):
-        # Nothing to load for a serialized TensorRT engine
-        pass
     def state_dict(self):
-        # Keep API compatible with nn.Module
         return {}
 class TensorRTLoader:
     @classmethod
     def INPUT_TYPES(s):

 import torch
 import os
 logger = trt.Logger(trt.Logger.INFO)
 runtime = trt.Runtime(logger)
 def trt_datatype_to_torch(datatype):
     """Map a TensorRT data type to the matching torch dtype.

     Both spellings of each type are accepted: the module-level alias
     (e.g. ``trt.float16``) and the ``trt.DataType`` enum member (e.g.
     ``trt.DataType.HALF``). Names are looked up with ``getattr`` so that a
     TensorRT build lacking one of them does not raise.

     Args:
         datatype: The TensorRT data type to translate.

     Returns:
         The corresponding torch dtype, or ``torch.float32`` when the type is
         not in the table below (kept for backward compatibility).
     """
     # (module-level alias, trt.DataType member, torch dtype)
     known_types = (
         ("float16", "HALF", torch.float16),
         ("float32", "FLOAT", torch.float32),
         ("bfloat16", "BF16", torch.bfloat16),
         ("int32", "INT32", torch.int32),
         # Without these, integer/bool tensors would be mapped to float32 and
         # buffers allocated from the result would have the wrong element size.
         ("int64", "INT64", torch.int64),
         ("int8", "INT8", torch.int8),
         ("uint8", "UINT8", torch.uint8),
         ("bool", "BOOL", torch.bool),
     )
     enum_cls = getattr(trt, "DataType", None)
     for alias, member, torch_dtype in known_types:
         for candidate in (
             getattr(trt, alias, None),
             getattr(enum_cls, member, None),
         ):
             # Skip names this TensorRT build does not define, so a missing
             # attribute (None) can never match the incoming datatype.
             if candidate is not None and datatype == candidate:
                 return torch_dtype
     # Fallback for types not listed above.
     return torch.float32
 class TrTUnet:
     def __init__(self, engine_path):
         """Load a serialized TensorRT engine and create its execution context.

         Args:
             engine_path: Path of the serialized engine file to read.

         Raises:
             RuntimeError: If the engine cannot be deserialized or an
                 execution context cannot be created for it.
         """
         with open(engine_path, "rb") as f:
             engine_bytes = f.read()
         self.engine = runtime.deserialize_cuda_engine(engine_bytes)
         # A failed deserialization yields None rather than an exception;
         # fail here with a clear message instead of an AttributeError below.
         if self.engine is None:
             raise RuntimeError(
                 f"Failed to deserialize TensorRT engine from '{engine_path}'"
             )
         self.context = self.engine.create_execution_context()
         if self.context is None:
             raise RuntimeError(
                 f"Failed to create TensorRT execution context for '{engine_path}'"
             )
         # Default precision – overridden to bfloat16 for Flux in TensorRTLoader
         self.dtype = torch.float16
     def __call__(self, x, timesteps, context, y=None,
                  control=None, transformer_options=None, **kwargs):
         """
+        x: [B, C, H, W]
+        timesteps: [B]
+        context: [B, T, Ctxt]
+        y: [B, adm_dim] (SDXL / SD3 / etc.)
+        Other kwargs (control, transformer_options, guidance, ...) are ignored
+        at TensorRT level, but must be accepted to match Comfy's callsite.
         """
+        # Use latent device as canonical device
+        device = x.device
+        # Helper to put everything on the right device / dtype and contiguous
+        def _prep(t):
+            if t is None:
+                return None
+            return t.to(device=device, dtype=self.dtype).contiguous()
+        x = _prep(x)
+        timesteps = _prep(timesteps)
+        context = _prep(context)
+        y = _prep(y)
+        # Discover engine IO tensors
         tensor_names = [
             self.engine.get_tensor_name(i)
             for i in range(self.engine.num_io_tensors)
             if self.engine.get_tensor_mode(n) == trt.TensorIOMode.OUTPUT
         ]
+        # Build a dict of available tensors by name
+        available = {"x": x, "timesteps": timesteps, "context": context}
+        if y is not None:
+            available["y"] = y
+        # Allow passing extra inputs (e.g. "guidance" for Flux) via kwargs
+        for k, v in kwargs.items():
+            if isinstance(v, torch.Tensor):
+                available[k] = _prep(v)
+        # Canonical order, so we never accidentally swap x/timesteps/context/y
+        canonical_order = {"x": 0, "timesteps": 1, "context": 2, "y": 3}
+        input_names_sorted = sorted(
+            input_names,
+            key=lambda n: canonical_order.get(n, 100),
+        )
+        # Bind all inputs – every engine input must get a valid tensor
+        for name in input_names_sorted:
+            if name not in available or available[name] is None:
                 raise RuntimeError(
+                    f"TensorRT engine expects input '{name}' but no tensor was provided."
                 )
+            t = available[name]
             self.context.set_input_shape(name, tuple(t.shape))
             self.context.set_tensor_address(name, t.data_ptr())
+        # Infer shapes (resolve dynamic dims)
+        missing = self.context.infer_shapes()
+        if missing:
             raise RuntimeError(
+                f"TensorRT shape inference failed, unresolved tensors: {missing}"
             )
+        # Ensure the context has enough device memory for the resolved shapes
+        self.context.update_device_memory_size_for_shapes()
         # Allocate and bind outputs
         outputs = []
         for name in output_names:
+            out_dims = self.context.get_tensor_shape(name)
+            out_shape = tuple(int(d) for d in out_dims)
+            out_dtype = trt_datatype_to_torch(self.engine.get_tensor_dtype(name))
+            out_tensor = torch.empty(out_shape, device=device, dtype=out_dtype)
+            self.context.set_tensor_address(name, out_tensor.data_ptr())
+            outputs.append(out_tensor)
+        # Execute on the current PyTorch stream for correct ordering
+        stream = torch.cuda.current_stream(device).cuda_stream
+        self.context.execute_async_v3(stream_handle=stream)
+        # Comfy's apply_model() will call .float() on this anyway
+        return outputs[0] if len(outputs) == 1 else tuple(outputs)
     def load_state_dict(self, sd, strict=False):
         """Accept a state dict and ignore it.

         Both ``sd`` and ``strict`` are unused; the method always returns
         ``None``.
         """
         # Nothing to load: the weights are inside the TensorRT engine file.
         return None
     def state_dict(self):
         """Return a new, empty dict on every call."""
         empty_state = {}
         return empty_state
 class TensorRTLoader:
     @classmethod
     def INPUT_TYPES(s):