saliacoel
/

MyCustomNodes

ONNX

Safetensors

depth_anything

Model card Files Files and versions

xet

Community

saliacoel committed on Dec 10, 2025

Commit

e24ff1f

verified ·

1 Parent(s): c6054f8

Upload 2 files

Browse files

Files changed (2) hide show

tensorrt_convert.py +1 -0
tensorrt_loader.py +91 -85

tensorrt_convert.py CHANGED Viewed

@@ -287,6 +287,7 @@ class TRT_MODEL_CONVERSION_BASE:
             input_names=input_names,
             output_names=output_names,
             opset_version=17,
             dynamo=False,  # <— force legacy ONNX exporter, no torch.export/dynamic_shapes
         )

             input_names=input_names,
             output_names=output_names,
             opset_version=17,
+            dynamic_axes=dynamic_axes,  # KEEP dynamic axes
             dynamo=False,  # <— force legacy ONNX exporter, no torch.export/dynamic_shapes
         )

tensorrt_loader.py CHANGED Viewed

@@ -43,108 +43,113 @@ def trt_datatype_to_torch(datatype):
 class TrTUnet:
     def __init__(self, engine_path):
         with open(engine_path, "rb") as f:
-            engine_bytes = f.read()
-        self.engine = runtime.deserialize_cuda_engine(engine_bytes)
         self.context = self.engine.create_execution_context()
-        # Default precision – overridden to bfloat16 for Flux in TensorRTLoader
-        self.dtype = torch.float16
-    def __call__(self, x, timesteps, context, y=None,
-                 control=None, transformer_options=None, **kwargs):
         """
-        x: [B, C, H, W]
-        timesteps: [B]
-        context: [B, T, Ctxt]
-        y: [B, adm_dim] (SDXL / SD3 / etc.)
-        Other kwargs (control, transformer_options, guidance, ...) are ignored
-        at TensorRT level, but must be accepted to match Comfy's callsite.
         """
-        # Use latent device as canonical device
-        device = x.device
-        # Helper to put everything on the right device / dtype and contiguous
-        def _prep(t):
-            if t is None:
-                return None
-            return t.to(device=device, dtype=self.dtype).contiguous()
-        x = _prep(x)
-        timesteps = _prep(timesteps)
-        context = _prep(context)
-        y = _prep(y)
-        # Discover engine IO tensors
-        tensor_names = [
-            self.engine.get_tensor_name(i)
-            for i in range(self.engine.num_io_tensors)
-        ]
-        input_names = [
-            n for n in tensor_names
-            if self.engine.get_tensor_mode(n) == trt.TensorIOMode.INPUT
-        ]
-        output_names = [
-            n for n in tensor_names
-            if self.engine.get_tensor_mode(n) == trt.TensorIOMode.OUTPUT
-        ]
-        # Build a dict of available tensors by name
-        available = {"x": x, "timesteps": timesteps, "context": context}
         if y is not None:
-            available["y"] = y
-        # Allow passing extra inputs (e.g. "guidance" for Flux) via kwargs
-        for k, v in kwargs.items():
-            if isinstance(v, torch.Tensor):
-                available[k] = _prep(v)
-        # Canonical order, so we never accidentally swap x/timesteps/context/y
-        canonical_order = {"x": 0, "timesteps": 1, "context": 2, "y": 3}
-        input_names_sorted = sorted(
-            input_names,
-            key=lambda n: canonical_order.get(n, 100),
-        )
-        # Bind all inputs – every engine input must get a valid tensor
-        for name in input_names_sorted:
-            if name not in available or available[name] is None:
-                raise RuntimeError(
-                    f"TensorRT engine expects input '{name}' but no tensor was provided."
-                )
-            t = available[name]
             self.context.set_input_shape(name, tuple(t.shape))
-            self.context.set_tensor_address(name, t.data_ptr())
-        # Infer shapes (resolve dynamic dims)
         missing = self.context.infer_shapes()
         if missing:
-            raise RuntimeError(
-                f"TensorRT shape inference failed, unresolved tensors: {missing}"
-            )
-        # Ensure the context has enough device memory for the resolved shapes
-        self.context.update_device_memory_size_for_shapes()
-        # Allocate and bind outputs
-        outputs = []
         for name in output_names:
-            out_dims = self.context.get_tensor_shape(name)
             out_shape = tuple(int(d) for d in out_dims)
-            out_dtype = trt_datatype_to_torch(self.engine.get_tensor_dtype(name))
-            out_tensor = torch.empty(out_shape, device=device, dtype=out_dtype)
-            self.context.set_tensor_address(name, out_tensor.data_ptr())
-            outputs.append(out_tensor)
-        # Execute on the current PyTorch stream for correct ordering
-        stream = torch.cuda.current_stream(device).cuda_stream
-        self.context.execute_async_v3(stream_handle=stream)
-        # Comfy's apply_model() will call .float() on this anyway
-        return outputs[0] if len(outputs) == 1 else tuple(outputs)
     def load_state_dict(self, sd, strict=False):
-        # No-op – weights are inside the TensorRT engine file.
-        return
     def state_dict(self):
         return {}
@@ -152,6 +157,7 @@ class TrTUnet:
 class TensorRTLoader:
     @classmethod
     def INPUT_TYPES(s):

 class TrTUnet:
     def __init__(self, engine_path):
         with open(engine_path, "rb") as f:
+            self.engine = runtime.deserialize_cuda_engine(f.read())
         self.context = self.engine.create_execution_context()
+        # Default torch device / dtype for allocations
+        self.device = comfy.model_management.get_torch_device()
+        self.default_dtype = torch.float16  # fallback if something unknown shows up
+    def _trt_dtype_to_torch(self, trt_dtype):
+        dt = trt_datatype_to_torch(trt_dtype)
+        return dt if dt is not None else self.default_dtype
+    def __call__(self, x, timesteps, context, y=None, control=None, transformer_options=None, **kwargs):
         """
+        x          : [B, C, H, W]
+        timesteps  : [B]
+        context    : [B, N, D]
+        y          : [B, y_dim]   (optional, SDXL etc.)
         """
+        # -----------------------------
+        # 1. Build dict of actual inputs
+        # -----------------------------
+        model_inputs = {
+            "x": x,
+            "timesteps": timesteps,
+            "context": context,
+        }
         if y is not None:
+            model_inputs["y"] = y
+        # If your engine has extra inputs (e.g. 'guidance' for Flux),
+        # they must either come from kwargs or be absent from the engine.
+        tensor_names = [self.engine.get_tensor_name(i) for i in range(self.engine.num_io_tensors)]
+        input_names  = [n for n in tensor_names if self.engine.get_tensor_mode(n) == trt.TensorIOMode.INPUT]
+        output_names = [n for n in tensor_names if self.engine.get_tensor_mode(n) == trt.TensorIOMode.OUTPUT]
+        # Fill missing inputs from kwargs if present
+        for name in input_names:
+            if name in model_inputs:
+                continue
+            if name in kwargs:
+                model_inputs[name] = kwargs[name]
+        if len(model_inputs) != len(input_names):
+            missing = [n for n in input_names if n not in model_inputs]
+            raise RuntimeError(
+                f"TensorRT UNet: missing required inputs for engine: {missing} "
+                f"(have {list(model_inputs.keys())})"
+            )
+        # -----------------------------
+        # 2. Convert each input to engine dtype + bind it
+        # -----------------------------
+        for name in input_names:
+            t = model_inputs[name]
+            # Move to correct device
+            if t.device != self.device:
+                t = t.to(self.device)
+            # Match TensorRT's expected dtype for this tensor
+            trt_dtype = self.engine.get_tensor_dtype(name)
+            torch_dtype = self._trt_dtype_to_torch(trt_dtype)
+            if t.dtype != torch_dtype:
+                t = t.to(dtype=torch_dtype)
+            # Update back (so later code sees the converted tensor if needed)
+            model_inputs[name] = t
+            # Set runtime shape and bind memory
             self.context.set_input_shape(name, tuple(t.shape))
+            self.context.set_tensor_address(name, int(t.data_ptr()))
+        # Make sure all shapes are resolved
         missing = self.context.infer_shapes()
         if missing:
+            raise RuntimeError(f"TensorRT shape inference failed, unresolved tensors: {missing}")
+        # -----------------------------
+        # 3. Allocate & bind outputs
+        # -----------------------------
+        outputs = {}
         for name in output_names:
+            out_dims = self.context.get_tensor_shape(name)  # trt.Dims
             out_shape = tuple(int(d) for d in out_dims)
+            trt_dtype = self.engine.get_tensor_dtype(name)
+            torch_dtype = self._trt_dtype_to_torch(trt_dtype)
+            out_tensor = torch.empty(out_shape, device=self.device, dtype=torch_dtype)
+            self.context.set_tensor_address(name, int(out_tensor.data_ptr()))
+            outputs[name] = out_tensor
+        # -----------------------------
+        # 4. Execute on the current torch CUDA stream
+        # -----------------------------
+        stream = torch.cuda.current_stream(self.device)
+        self.context.execute_async_v3(stream_handle=stream.cuda_stream)
+        # No need to sync explicitly; ComfyUI uses the same default stream.
+        # Return outputs in a stable order
+        out_list = [outputs[name] for name in output_names]
+        return out_list[0] if len(out_list) == 1 else tuple(out_list)
     def load_state_dict(self, sd, strict=False):
+        pass
     def state_dict(self):
         return {}
 class TensorRTLoader:
     @classmethod
     def INPUT_TYPES(s):