saliacoel
/

MyCustomNodes

ONNX

Safetensors

depth_anything

Model card Files Files and versions

xet

Community

saliacoel committed on Dec 10, 2025

Commit

1306c71

verified ·

1 Parent(s): 18518ac

Upload tensorrt_loader.py

Browse files

Files changed (1) hide show

tensorrt_loader.py +114 -49

tensorrt_loader.py CHANGED Viewed

@@ -40,72 +40,137 @@ class TrTUnet:
         with open(engine_path, "rb") as f:
             self.engine = runtime.deserialize_cuda_engine(f.read())
         self.context = self.engine.create_execution_context()
         self.dtype = torch.float16
     def set_bindings_shape(self, inputs, split_batch):
         for k in inputs:
             shape = inputs[k].shape
             shape = [shape[0] // split_batch] + list(shape[1:])
             self.context.set_input_shape(k, shape)
-def __call__(self, x, timesteps, context, y=None, **kwargs):
-    # Ensure input types match engine precision (e.g., FP16)
-    if x.dtype != self.dtype:
-        x = x.to(dtype=self.dtype)
-        timesteps = timesteps.to(dtype=self.dtype)
-        context = context.to(dtype=self.dtype)
         if y is not None:
-            y = y.to(dtype=self.dtype)
-    # Prepare model inputs list
-    model_inputs = [x, timesteps, context]
-    if y is not None:
-        model_inputs.append(y)
-    # Set dynamic input shapes for the execution context
-    tensor_names = [self.engine.get_tensor_name(i) for i in range(self.engine.num_io_tensors)]
-    # Identify input and output names using TensorRT I/O mode
-    input_names  = [n for n in tensor_names if self.engine.get_tensor_mode(n) == trt.TensorIOMode.INPUT]
-    output_names = [n for n in tensor_names if self.engine.get_tensor_mode(n) == trt.TensorIOMode.OUTPUT]
-    # Ensure we have a matching number of input names and provided tensors
-    if len(input_names) != len(model_inputs):
-        raise RuntimeError(f"Expected {len(input_names)} inputs for TensorRT engine, but got {len(model_inputs)}.")
-    # Set input shapes and addresses
-    for name, tensor in zip(input_names, model_inputs):
-        shape = tuple(tensor.shape)
-        self.context.set_input_shape(name, shape)        # specify runtime shape for dynamic dims
-        self.context.set_tensor_address(name, tensor.data_ptr())  # bind input memory
-    # Infer shapes (ensures all dynamic dims are resolved)
-    missing = self.context.infer_shapes()
-    if missing:  # if any tensor shapes still unspecified, something is wrong
-        raise RuntimeError(f"TensorRT shape inference failed, unresolved tensors: {missing}")
-    # Allocate outputs with proper shapes
-    outputs = []
-    for name in output_names:
-        out_dims = self.context.get_tensor_shape(name)   # get resolved output shape (trt.Dims)
-        out_shape = [int(d) for d in out_dims]           # convert Dims to list of ints
-        out_tensor = torch.empty(out_shape, device=self.torch_device, dtype=self.torch_dtype)
-        self.context.set_tensor_address(name, out_tensor.data_ptr())  # bind output memory
-        outputs.append(out_tensor)
-    # Execute the engine (on default CUDA stream or a pre-created stream)
-    self.context.execute_async_v3(stream_handle=0)  # using default stream (0) for simplicity
-    # If only one output tensor, return it directly for convenience
-    return outputs[0] if len(outputs) == 1 else tuple(outputs)
     def load_state_dict(self, sd, strict=False):
         pass
     def state_dict(self):
         return {}
 class TensorRTLoader:
     @classmethod
     def INPUT_TYPES(s):

         with open(engine_path, "rb") as f:
             self.engine = runtime.deserialize_cuda_engine(f.read())
         self.context = self.engine.create_execution_context()
+        # default dtype in case something doesn't have a specific TRT dtype
         self.dtype = torch.float16
     def set_bindings_shape(self, inputs, split_batch):
+        # still here in case something else calls it, but the new __call__
+        # no longer uses this split-batch path
         for k in inputs:
             shape = inputs[k].shape
             shape = [shape[0] // split_batch] + list(shape[1:])
             self.context.set_input_shape(k, shape)
+    def __call__(self, x, timesteps, context, y=None,
+                 control=None, transformer_options=None, **kwargs):
+        """
+        Run the TensorRT UNet.
+        - `control` and `transformer_options` are accepted for API compatibility
+          with Comfy, but ignored by the TRT engine.
+        - Any extra tensor inputs (e.g. `guidance` for Flux) are taken from
+          **kwargs and matched by name to the engine’s input tensors.
+        """
+        # Collect all tensors we might need by name
+        available = {
+            "x": x,
+            "timesteps": timesteps,
+            "context": context,
+        }
         if y is not None:
+            available["y"] = y
+        # Extra conds (e.g. 'guidance', etc.) may come in via kwargs
+        for name, value in kwargs.items():
+            if isinstance(value, torch.Tensor):
+                available[name] = value
+        # Query engine IO tensors
+        tensor_names = [
+            self.engine.get_tensor_name(i)
+            for i in range(self.engine.num_io_tensors)
+        ]
+        input_names = [
+            n for n in tensor_names
+            if self.engine.get_tensor_mode(n) == trt.TensorIOMode.INPUT
+        ]
+        output_names = [
+            n for n in tensor_names
+            if self.engine.get_tensor_mode(n) == trt.TensorIOMode.OUTPUT
+        ]
+        # Sanity check: we must have a tensor for every input
+        missing = [n for n in input_names if n not in available]
+        if missing:
+            raise RuntimeError(
+                f"Missing tensors for TensorRT engine inputs: {missing}. "
+                f"Available: {list(available.keys())}"
+            )
+        device = x.device
+        # Bind inputs: fix dtype + device, set shapes and addresses
+        for name in input_names:
+            t = available[name]
+            if not t.is_contiguous():
+                t = t.contiguous()
+            # Match engine dtype
+            trt_dtype = self.engine.get_tensor_dtype(name)
+            torch_dtype = trt_datatype_to_torch(trt_dtype)
+            if torch_dtype is None:
+                raise RuntimeError(
+                    f"Unsupported TensorRT dtype {trt_dtype} for input '{name}'"
+                )
+            if t.dtype != torch_dtype:
+                t = t.to(dtype=torch_dtype)
+            if t.device != device:
+                t = t.to(device)
+            # Save back in case we changed it
+            available[name] = t
+            # Tell TRT the runtime shape and bind the memory
+            self.context.set_input_shape(name, tuple(t.shape))
+            self.context.set_tensor_address(name, t.data_ptr())
+        # Let TRT resolve all dynamic shapes (outputs etc.)
+        unresolved = self.context.infer_shapes()
+        if unresolved:
+            raise RuntimeError(
+                f"TensorRT shape inference failed, unresolved tensors: {unresolved}"
+            )
+        # Allocate and bind outputs
+        outputs = []
+        for name in output_names:
+            dims = self.context.get_tensor_shape(name)  # trt.Dims
+            # Guard against the old nbDims == -1 issue
+            if hasattr(dims, "nb_dims") and dims.nb_dims < 0:
+                raise RuntimeError(f"Output '{name}' has invalid dims: {dims}")
+            shape = [int(d) for d in dims]
+            trt_dtype = self.engine.get_tensor_dtype(name)
+            torch_dtype = trt_datatype_to_torch(trt_dtype)
+            out = torch.empty(shape, device=device, dtype=torch_dtype)
+            self.context.set_tensor_address(name, out.data_ptr())
+            outputs.append(out)
+        # Run on the default torch CUDA stream
+        stream = torch.cuda.default_stream(device)
+        self.context.execute_async_v3(stream_handle=stream.cuda_stream)
+        # Return single tensor or a tuple
+        if len(outputs) == 1:
+            return outputs[0]
+        return tuple(outputs)
     def load_state_dict(self, sd, strict=False):
+        # Nothing to load for a serialized TensorRT engine
         pass
     def state_dict(self):
+        # Keep API compatible with nn.Module
         return {}
 class TensorRTLoader:
     @classmethod
     def INPUT_TYPES(s):