saliacoel committed
Commit 7c128cf · verified · 1 Parent(s): 91e9495

Upload tensorrt_convert.py

Files changed (1):
  1. tensorrt_convert.py +325 -122
tensorrt_convert.py CHANGED
@@ -1,17 +1,52 @@
- import torch
- import sys
  import os
+ import sys
  import time
+
+ import torch
  import comfy.model_management

  import tensorrt as trt
  import folder_paths
  from tqdm import tqdm

- # TODO:
- # Make it more generic: less model specific code
+ # -------------------------------------------------------------------------
+ # torch.export dynamic shapes support
+ # -------------------------------------------------------------------------
+ try:
+     from torch.export import Dim
+ except Exception as e:
+     raise RuntimeError(
+         "[TensorRTExport] torch.export.Dim not available. "
+         "Please upgrade PyTorch to >= 2.1 / 2.5+ to use the Dynamo-based "
+         "ONNX exporter with dynamic shapes."
+     ) from e
+
+
+ def trtlog(msg: str):
+     print(f"[TensorRTExport] {msg}", flush=True)
+
+
+ # Opset handling:
+ #  - If COMFY_TRT_ONNX_OPSET is set, use that integer.
+ #  - Otherwise, leave opset_version=None so torch.onnx uses the
+ #    recommended opset for this PyTorch version (e.g. 20 on 2.9).
+ DEFAULT_ONNX_OPSET = None
+ _env_opset = os.getenv("COMFY_TRT_ONNX_OPSET")
+ if _env_opset is not None:
+     try:
+         DEFAULT_ONNX_OPSET = int(_env_opset)
+         trtlog(f"Using opset_version from COMFY_TRT_ONNX_OPSET={DEFAULT_ONNX_OPSET}")
+     except ValueError:
+         trtlog(
+             f"WARNING: invalid COMFY_TRT_ONNX_OPSET={_env_opset!r}, "
+             "falling back to PyTorch recommended opset (None)."
+         )
+         DEFAULT_ONNX_OPSET = None
+

- # add output directory to tensorrt search path
+ # -------------------------------------------------------------------------
+ # Add output directory to TensorRT search path (ComfyUI integration)
+ # -------------------------------------------------------------------------
  if "tensorrt" in folder_paths.folder_names_and_paths:
      folder_paths.folder_names_and_paths["tensorrt"][0].append(
          os.path.join(folder_paths.get_output_directory(), "tensorrt")
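For reference, a minimal standalone sketch (not part of this commit) of the torch.export.Dim / dynamo=True export path the new header code enables; the Toy module, "toy.onnx", and the dimension range are illustrative assumptions, requiring a recent PyTorch (2.5+):

    import torch
    from torch.export import Dim

    class Toy(torch.nn.Module):
        def forward(self, x):
            return x * 2.0

    # Declare dim 0 of "x" as dynamic in [1, 8]; the remaining dims are
    # specialized from the example input below.
    batch = Dim("batch", min=1, max=8)
    torch.onnx.export(
        Toy(),
        (torch.zeros(2, 4),),
        "toy.onnx",
        input_names=["x"],
        output_names=["y"],
        dynamo=True,                       # torch.export-based exporter
        dynamic_shapes={"x": {0: batch}},  # replaces legacy dynamic_axes
    )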
@@ -23,6 +58,10 @@ else:
          {".engine"},
      )

+
+ # -------------------------------------------------------------------------
+ # Progress monitor for TensorRT builds
+ # -------------------------------------------------------------------------
  class TQDMProgressMonitor(trt.IProgressMonitor):
      def __init__(self):
          trt.IProgressMonitor.__init__(self)
@@ -53,8 +92,9 @@ class TQDMProgressMonitor(trt.IProgressMonitor):
                      "parent_phase": parent_phase,
                  }
          except KeyboardInterrupt:
-             # The phase_start callback cannot directly cancel the build, so request the cancellation from within step_complete.
-             _step_result = False
+             # The phase_start callback cannot directly cancel the build,
+             # so request the cancellation from within step_complete.
+             self._step_result = False

      def phase_finish(self, phase_name):
          try:
@@ -78,9 +118,8 @@ class TQDMProgressMonitor(trt.IProgressMonitor):
                          self._active_phases[phase_name]["parent_phase"]
                      ]["tq"].refresh()
                  del self._active_phases[phase_name]
-                 pass
          except KeyboardInterrupt:
-             _step_result = False
+             self._step_result = False

      def step_complete(self, phase_name, step):
          try:
@@ -90,16 +129,22 @@ class TQDMProgressMonitor(trt.IProgressMonitor):
                  )
              return self._step_result
          except KeyboardInterrupt:
-             # There is no need to propagate this exception to TensorRT. We can simply cancel the build.
+             # There is no need to propagate this exception to TensorRT.
+             # We can simply cancel the build.
              return False
-

+
+ # -------------------------------------------------------------------------
+ # Base class for ONNX -> TensorRT conversion
+ # -------------------------------------------------------------------------
  class TRT_MODEL_CONVERSION_BASE:
      def __init__(self):
          self.output_dir = folder_paths.get_output_directory()
          self.temp_dir = folder_paths.get_temp_directory()
          self.timing_cache_path = os.path.normpath(
-             os.path.join(os.path.join(os.path.dirname(os.path.realpath(__file__)), "timing_cache.trt"))
+             os.path.join(
+                 os.path.dirname(os.path.realpath(__file__)), "timing_cache.trt"
+             )
          )

      RETURN_TYPES = ()
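The TQDMProgressMonitor above implements TensorRT's build-progress callback interface; a stripped-down monitor (illustrative, not in the commit) shows the contract the builder expects:

    import tensorrt as trt

    class PrintMonitor(trt.IProgressMonitor):
        def __init__(self):
            trt.IProgressMonitor.__init__(self)

        def phase_start(self, phase_name, parent_phase, num_steps):
            print(f"start {phase_name}: {num_steps} steps")

        def step_complete(self, phase_name, step):
            print(f"{phase_name}: step {step}")
            return True  # returning False asks TensorRT to cancel the build

        def phase_finish(self, phase_name):
            print(f"finish {phase_name}")

    # usage: config.progress_monitor = PrintMonitor()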
@@ -117,9 +162,9 @@ class TRT_MODEL_CONVERSION_BASE:
          if os.path.exists(self.timing_cache_path):
              with open(self.timing_cache_path, mode="rb") as timing_cache_file:
                  buffer = timing_cache_file.read()
-                 print("Read {} bytes from timing cache.".format(len(buffer)))
+                 trtlog(f"Read {len(buffer)} bytes from timing cache.")
          else:
-             print("No timing cache found; Initializing a new one.")
+             trtlog("No timing cache found; initializing a new one.")
          timing_cache: trt.ITimingCache = config.create_timing_cache(buffer)
          config.set_timing_cache(timing_cache, ignore_mismatch=True)

@@ -127,7 +172,9 @@ class TRT_MODEL_CONVERSION_BASE:
      def _save_timing_cache(self, config: trt.IBuilderConfig):
          timing_cache: trt.ITimingCache = config.get_timing_cache()
          with open(self.timing_cache_path, "wb") as timing_cache_file:
-             timing_cache_file.write(memoryview(timing_cache.serialize()))
+             serialized = timing_cache.serialize()
+             timing_cache_file.write(memoryview(serialized))
+         trtlog(f"Timing cache saved to {self.timing_cache_path}")

      def _convert(
          self,
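The two timing-cache helpers amount to the round trip below (a sketch using the same TensorRT builder-config API, with hypothetical helper names and path):

    import os
    import tensorrt as trt

    def attach_timing_cache(config: trt.IBuilderConfig, path: str) -> None:
        # Seed the cache from disk if present; an empty buffer creates a new one.
        buffer = b""
        if os.path.exists(path):
            with open(path, "rb") as f:
                buffer = f.read()
        cache = config.create_timing_cache(buffer)
        config.set_timing_cache(cache, ignore_mismatch=True)

    def persist_timing_cache(config: trt.IBuilderConfig, path: str) -> None:
        # Serialize the (possibly updated) cache so later builds skip retiming.
        with open(path, "wb") as f:
            f.write(memoryview(config.get_timing_cache().serialize()))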
@@ -148,15 +195,39 @@ class TRT_MODEL_CONVERSION_BASE:
          num_video_frames,
          is_static: bool,
      ):
+         # -----------------------------------------------------------------
+         # Basic logging: versions & configuration
+         # -----------------------------------------------------------------
+         trtlog(
+             f"PyTorch version: {torch.__version__}, TensorRT version: {trt.__version__}"
+         )
+         trtlog(
+             f"Requested {'STATIC' if is_static else 'DYNAMIC'} TensorRT engine "
+             f"(b=[{batch_size_min},{batch_size_opt},{batch_size_max}], "
+             f"h=[{height_min},{height_opt},{height_max}], "
+             f"w=[{width_min},{width_opt},{width_max}], "
+             f"context=[{context_min},{context_opt},{context_max}], "
+             f"num_video_frames={num_video_frames})"
+         )
+
          output_onnx = os.path.normpath(
              os.path.join(
                  os.path.join(self.temp_dir, "{}".format(time.time())), "model.onnx"
              )
          )
+         trtlog(f"Temporary ONNX path: {output_onnx}")

+         # -----------------------------------------------------------------
+         # Load model to GPU
+         # -----------------------------------------------------------------
          comfy.model_management.unload_all_models()
-         comfy.model_management.load_models_gpu([model], force_patch_weights=True, force_full_load=True)
+         comfy.model_management.load_models_gpu(
+             [model], force_patch_weights=True, force_full_load=True
+         )
+
          unet = model.model.diffusion_model
+         model_type = type(model.model).__name__
+         trtlog(f"Detected model type: {model_type}")

          context_dim = model.model.model_config.unet_config.get("context_dim", None)
          context_len = 77
@@ -165,149 +236,265 @@ class TRT_MODEL_CONVERSION_BASE:
          extra_input = {}
          dtype = torch.float16

-         if isinstance(model.model, comfy.model_base.SD3): #SD3
-             context_embedder_config = model.model.model_config.unet_config.get("context_embedder_config", None)
+         # -----------------------------------------------------------------
+         # Model-type specific tweaks
+         # -----------------------------------------------------------------
+         if isinstance(model.model, comfy.model_base.SD3):  # SD3
+             context_embedder_config = model.model.model_config.unet_config.get(
+                 "context_embedder_config", None
+             )
              if context_embedder_config is not None:
-                 context_dim = context_embedder_config.get("params", {}).get("in_features", None)
-                 context_len = 154 #NOTE: SD3 can have 77 or 154 depending on which text encoders are used, this is why context_len_min stays 77
+                 context_dim = context_embedder_config.get(
+                     "params", {}
+                 ).get("in_features", None)
+                 # SD3 can have 77 or 154 depending on TE usage
+                 context_len = 154
+                 trtlog(f"SD3 context_dim={context_dim}, context_len={context_len}")
          elif isinstance(model.model, comfy.model_base.AuraFlow):
              context_dim = 2048
              context_len_min = 256
              context_len = 256
+             trtlog(
+                 f"AuraFlow context_dim={context_dim}, "
+                 f"context_len_min={context_len_min}, context_len={context_len}"
+             )
          elif isinstance(model.model, comfy.model_base.Flux):
-             context_dim = model.model.model_config.unet_config.get("context_in_dim", None)
+             context_dim = model.model.model_config.unet_config.get(
+                 "context_in_dim", None
+             )
              context_len_min = 256
              context_len = 256
              y_dim = model.model.model_config.unet_config.get("vec_in_dim", None)
              extra_input = {"guidance": ()}
              dtype = torch.bfloat16
+             trtlog(
+                 f"Flux context_dim={context_dim}, y_dim={y_dim}, "
+                 f"context_len_min={context_len_min}, context_len={context_len}, "
+                 f"extra_input={list(extra_input.keys())}, dtype={dtype}"
+             )

-         if context_dim is not None:
-             input_names = ["x", "timesteps", "context"]
-             output_names = ["h"]
-
-             dynamic_axes = {
-                 "x": {0: "batch", 2: "height", 3: "width"},
-                 "timesteps": {0: "batch"},
-                 "context": {0: "batch", 1: "num_embeds"},
-             }
-
-             transformer_options = model.model_options['transformer_options'].copy()
-             if model.model.model_config.unet_config.get(
-                 "use_temporal_resblock", False
-             ): # SVD
-                 batch_size_min = num_video_frames * batch_size_min
-                 batch_size_opt = num_video_frames * batch_size_opt
-                 batch_size_max = num_video_frames * batch_size_max
-
-                 class UNET(torch.nn.Module):
-                     def forward(self, x, timesteps, context, y):
-                         return self.unet(
-                             x,
-                             timesteps,
-                             context,
-                             y,
-                             num_video_frames=self.num_video_frames,
-                             transformer_options=self.transformer_options,
-                         )
-
-                 svd_unet = UNET()
-                 svd_unet.num_video_frames = num_video_frames
-                 svd_unet.unet = unet
-                 svd_unet.transformer_options = transformer_options
-                 unet = svd_unet
-                 context_len_min = context_len = 1
-             else:
-                 class UNET(torch.nn.Module):
-                     def forward(self, x, timesteps, context, *args):
-                         extras = input_names[3:]
-                         extra_args = {}
-                         for i in range(len(extras)):
-                             extra_args[extras[i]] = args[i]
-                         return self.unet(x, timesteps, context, transformer_options=self.transformer_options, **extra_args)
-
-                 _unet = UNET()
-                 _unet.unet = unet
-                 _unet.transformer_options = transformer_options
-                 unet = _unet
-
-             input_channels = model.model.model_config.unet_config.get("in_channels", 4)
-
-             inputs_shapes_min = (
-                 (batch_size_min, input_channels, height_min // 8, width_min // 8),
-                 (batch_size_min,),
-                 (batch_size_min, context_len_min * context_min, context_dim),
+         if context_dim is None:
+             print("ERROR: model not supported (no context_dim).")
+             comfy.model_management.unload_all_models()
+             comfy.model_management.soft_empty_cache()
+             return ()
+
+         input_names = ["x", "timesteps", "context"]
+         output_names = ["h"]
+
+         transformer_options = model.model_options["transformer_options"].copy()
+         use_temporal = model.model.model_config.unet_config.get(
+             "use_temporal_resblock", False
+         )
+
+         # -----------------------------------------------------------------
+         # Wrap UNet so argument names are stable for dynamic_shapes
+         # -----------------------------------------------------------------
+         if use_temporal:  # SVD
+             trtlog("Model uses temporal resblock (SVD-like). Adjusting batch sizes.")
+             batch_size_min = num_video_frames * batch_size_min
+             batch_size_opt = num_video_frames * batch_size_opt
+             batch_size_max = num_video_frames * batch_size_max
+
+             class SVD_UNET(torch.nn.Module):
+                 def __init__(self, unet, transformer_options, num_video_frames):
+                     super().__init__()
+                     self.unet = unet
+                     self.transformer_options = transformer_options
+                     self.num_video_frames = num_video_frames
+
+                 def forward(self, x, timesteps, context, y):
+                     return self.unet(
+                         x,
+                         timesteps,
+                         context,
+                         y,
+                         num_video_frames=self.num_video_frames,
+                         transformer_options=self.transformer_options,
+                     )
+
+             unet = SVD_UNET(unet, transformer_options, num_video_frames)
+             context_len_min = context_len = 1
+             trtlog(
+                 f"SVD adjusted batch: "
+                 f"b=[{batch_size_min},{batch_size_opt},{batch_size_max}], "
+                 f"context_len_min={context_len_min}, context_len={context_len}"
          )
-             inputs_shapes_opt = (
-                 (batch_size_opt, input_channels, height_opt // 8, width_opt // 8),
-                 (batch_size_opt,),
-                 (batch_size_opt, context_len * context_opt, context_dim),
+
+         else:
+             # Generic wrapper with named extras (y, guidance)
+             extra_keys = list(extra_input.keys())
+
+             class UNET(torch.nn.Module):
+                 def __init__(self, unet, transformer_options, y_dim, extra_keys):
+                     super().__init__()
+                     self.unet = unet
+                     self.transformer_options = transformer_options
+                     self.y_dim = y_dim
+                     self.extra_keys = extra_keys
+
+                 def forward(self, x, timesteps, context, y=None, guidance=None):
+                     extra_args = {}
+                     if self.y_dim is not None and self.y_dim > 0 and y is not None:
+                         extra_args["y"] = y
+                     if "guidance" in self.extra_keys and guidance is not None:
+                         extra_args["guidance"] = guidance
+
+                     return self.unet(
+                         x,
+                         timesteps,
+                         context,
+                         transformer_options=self.transformer_options,
+                         **extra_args,
+                     )
+
+             unet = UNET(unet, transformer_options, y_dim, extra_keys)
+
+         # -----------------------------------------------------------------
+         # Compute input shapes (min / opt / max)
+         # -----------------------------------------------------------------
+         input_channels = model.model.model_config.unet_config.get("in_channels", 4)
+
+         inputs_shapes_min = (
+             (batch_size_min, input_channels, height_min // 8, width_min // 8),
+             (batch_size_min,),
+             (batch_size_min, context_len_min * context_min, context_dim),
+         )
+         inputs_shapes_opt = (
+             (batch_size_opt, input_channels, height_opt // 8, width_opt // 8),
+             (batch_size_opt,),
+             (batch_size_opt, context_len * context_opt, context_dim),
+         )
+         inputs_shapes_max = (
+             (batch_size_max, input_channels, height_max // 8, width_max // 8),
+             (batch_size_max,),
+             (batch_size_max, context_len * context_max, context_dim),
+         )
+
+         if y_dim is not None and y_dim > 0:
+             input_names.append("y")
+             inputs_shapes_min += ((batch_size_min, y_dim),)
+             inputs_shapes_opt += ((batch_size_opt, y_dim),)
+             inputs_shapes_max += ((batch_size_max, y_dim),)
+
+         # Extra inputs (currently used for Flux guidance)
+         for k in extra_input:
+             input_names.append(k)
+             shape_suffix = extra_input[k]  # e.g. () for scalar per batch
+             inputs_shapes_min += ((batch_size_min,) + shape_suffix,)
+             inputs_shapes_opt += ((batch_size_opt,) + shape_suffix,)
+             inputs_shapes_max += ((batch_size_max,) + shape_suffix,)
+
+         # Clamp context ranges sanely if the UI somehow passed inverted min/max
+         if context_max < context_min:
+             trtlog(
+                 f"WARNING: context_max({context_max}) < context_min({context_min}), swapping."
          )
-             inputs_shapes_max = (
-                 (batch_size_max, input_channels, height_max // 8, width_max // 8),
-                 (batch_size_max,),
-                 (batch_size_max, context_len * context_max, context_dim),
+             context_min, context_max = context_max, context_min
+
+         trtlog("Input names: " + ", ".join(input_names))
+         for idx, name in enumerate(input_names):
+             trtlog(
+                 f"  {name}: "
+                 f"min={inputs_shapes_min[idx]}, "
+                 f"opt={inputs_shapes_opt[idx]}, "
+                 f"max={inputs_shapes_max[idx]}"
          )

-             if y_dim > 0:
-                 input_names.append("y")
-                 dynamic_axes["y"] = {0: "batch"}
-                 inputs_shapes_min += ((batch_size_min, y_dim),)
-                 inputs_shapes_opt += ((batch_size_opt, y_dim),)
-                 inputs_shapes_max += ((batch_size_max, y_dim),)
-
-             for k in extra_input:
-                 input_names.append(k)
-                 dynamic_axes[k] = {0: "batch"}
-                 inputs_shapes_min += ((batch_size_min,) + extra_input[k],)
-                 inputs_shapes_opt += ((batch_size_opt,) + extra_input[k],)
-                 inputs_shapes_max += ((batch_size_max,) + extra_input[k],)
-
-
-             inputs = ()
-             for shape in inputs_shapes_opt:
-                 inputs += (
-                     torch.zeros(
-                         shape,
-                         device=comfy.model_management.get_torch_device(),
-                         dtype=dtype,
-                     ),
-                 )
+         # -----------------------------------------------------------------
+         # Build dynamic_shapes spec for torch.export / dynamo=True
+         # -----------------------------------------------------------------
+         B = Dim("batch", min=batch_size_min, max=batch_size_max)
+         H = Dim("height", min=height_min // 8, max=height_max // 8)
+         W = Dim("width", min=width_min // 8, max=width_max // 8)
+         T = Dim(
+             "tokens",
+             min=context_len_min * context_min,
+             max=context_len * context_max,
+         )

-             else:
-                 print("ERROR: model not supported.")
-                 return ()
+         dynamic_shapes = {
+             "x": {0: B, 2: H, 3: W},
+             "timesteps": {0: B},
+             "context": {0: B, 1: T},
+         }
+
+         if "y" in input_names:
+             dynamic_shapes["y"] = {0: B}
+         if "guidance" in input_names:
+             dynamic_shapes["guidance"] = {0: B}
+
+         trtlog(f"dynamic_shapes spec: {dynamic_shapes}")
+
+         # -----------------------------------------------------------------
+         # Build example inputs (using OPT shapes)
+         # -----------------------------------------------------------------
+         inputs = ()
+         for shape in inputs_shapes_opt:
+             inputs += (
+                 torch.zeros(
+                     shape,
+                     device=comfy.model_management.get_torch_device(),
+                     dtype=dtype,
+                 ),
+             )

+         # -----------------------------------------------------------------
+         # ONNX export with Dynamo + dynamic_shapes
+         # -----------------------------------------------------------------
          os.makedirs(os.path.dirname(output_onnx), exist_ok=True)
-         torch.onnx.export(
-             unet,
-             inputs,
-             output_onnx,
-             verbose=False,
-             input_names=input_names,
-             output_names=output_names,
-             opset_version=17,
-             dynamic_axes=False,
-             dynamo=False,
+
+         trtlog(
+             f"Exporting UNet to ONNX with dynamo=True, "
+             f"opset_version={DEFAULT_ONNX_OPSET}, dtype={dtype}, "
+             f"output={output_onnx}"
          )

+         try:
+             torch.onnx.export(
+                 unet,
+                 inputs,
+                 output_onnx,
+                 verbose=False,
+                 input_names=input_names,
+                 output_names=output_names,
+                 opset_version=DEFAULT_ONNX_OPSET,
+                 dynamo=True,
+                 dynamic_shapes=dynamic_shapes,
+                 # NOTE:
+                 #  - We intentionally do NOT pass dynamic_axes here.
+                 #    dynamic_axes is for the legacy TorchScript exporter,
+                 #    dynamic_shapes + dynamo=True is the modern path.
+             )
+             trtlog("torch.onnx.export completed successfully.")
+         except Exception as e:
+             trtlog(f"ERROR during torch.onnx.export: {e}")
+             # Clean up GPU state before re-raising
+             comfy.model_management.unload_all_models()
+             comfy.model_management.soft_empty_cache()
+             raise
+
          comfy.model_management.unload_all_models()
          comfy.model_management.soft_empty_cache()

-         # TRT conversion starts here
+         # -----------------------------------------------------------------
+         # TensorRT conversion starts here
+         # -----------------------------------------------------------------
          logger = trt.Logger(trt.Logger.INFO)
          builder = trt.Builder(logger)
+         trtlog("Created TensorRT builder.")

          network = builder.create_network(
              1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
          )
          parser = trt.OnnxParser(network, logger)
+         trtlog(f"Parsing ONNX file: {output_onnx}")
          success = parser.parse_from_file(output_onnx)
          for idx in range(parser.num_errors):
              print(parser.get_error(idx))

          if not success:
-             print("ONNX load ERROR")
+             print("ONNX load ERROR (TensorRT parser.parse_from_file returned False).")
              return ()

          config = builder.create_builder_config()
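To make the min/opt/max shape tuples above concrete, a worked example with hypothetical node settings (the numbers are illustrative, not from the commit):

    # SDXL-like assumptions: context_dim=2048, 77-token context, 4 latent channels.
    batch_size_opt = 1
    height_opt, width_opt = 1024, 1024
    context_opt = 1
    context_len = 77
    context_dim, input_channels = 2048, 4

    inputs_shapes_opt = (
        (batch_size_opt, input_channels, height_opt // 8, width_opt // 8),
        (batch_size_opt,),
        (batch_size_opt, context_len * context_opt, context_dim),
    )
    print(inputs_shapes_opt)
    # ((1, 4, 128, 128), (1,), (1, 77, 2048))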
@@ -315,22 +502,28 @@ class TRT_MODEL_CONVERSION_BASE:
          self._setup_timing_cache(config)
          config.progress_monitor = TQDMProgressMonitor()

+         trtlog("Creating optimization profile:")
          prefix_encode = ""
          for k in range(len(input_names)):
              min_shape = inputs_shapes_min[k]
              opt_shape = inputs_shapes_opt[k]
              max_shape = inputs_shapes_max[k]
+             trtlog(
+                 f"  {input_names[k]}: min={min_shape}, opt={opt_shape}, max={max_shape}"
+             )
              profile.set_shape(input_names[k], min_shape, opt_shape, max_shape)

              # Encode shapes to filename
-             encode = lambda a: ".".join(map(lambda x: str(x), a))
+             encode = lambda a: ".".join(map(str, a))
              prefix_encode += "{}#{}#{}#{};".format(
                  input_names[k], encode(min_shape), encode(opt_shape), encode(max_shape)
              )

          if dtype == torch.float16:
+             trtlog("Enabling FP16 mode in TensorRT builder config.")
              config.set_flag(trt.BuilderFlag.FP16)
          if dtype == torch.bfloat16:
+             trtlog("Enabling BF16 mode in TensorRT builder config.")
              config.set_flag(trt.BuilderFlag.BF16)

          config.add_optimization_profile(profile)
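The encode lambda above flattens each shape triple into the engine filename prefix; for one hypothetical input it produces:

    encode = lambda a: ".".join(map(str, a))
    min_shape, opt_shape, max_shape = (1, 4, 64, 64), (1, 4, 128, 128), (4, 4, 192, 192)
    print("{}#{}#{}#{};".format("x", encode(min_shape), encode(opt_shape), encode(max_shape)))
    # x#1.4.64.64#1.4.128.128#4.4.192.192;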
@@ -372,7 +565,11 @@ class TRT_MODEL_CONVERSION_BASE:
              ),
          )

+         trtlog("Building serialized TensorRT engine. This may take a while...")
          serialized_engine = builder.build_serialized_network(network, config)
+         if serialized_engine is None:
+             trtlog("ERROR: builder.build_serialized_network returned None.")
+             return ()

          full_output_folder, filename, counter, subfolder, filename_prefix = (
              folder_paths.get_save_image_path(filename_prefix, self.output_dir)
@@ -381,14 +578,20 @@ class TRT_MODEL_CONVERSION_BASE:
              full_output_folder, f"{filename}_{counter:05}_.engine"
          )

+         trtlog(f"Writing TensorRT engine to: {output_trt_engine}")
+         os.makedirs(full_output_folder, exist_ok=True)
          with open(output_trt_engine, "wb") as f:
              f.write(serialized_engine)

          self._save_timing_cache(config)
+         trtlog("TensorRT conversion complete.")

          return ()


+ # -------------------------------------------------------------------------
+ # Dynamic / Static wrapper nodes
+ # -------------------------------------------------------------------------
  class DYNAMIC_TRT_MODEL_CONVERSION(TRT_MODEL_CONVERSION_BASE):
      def __init__(self):
          super(DYNAMIC_TRT_MODEL_CONVERSION, self).__init__()
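The saved .engine is consumed elsewhere; loading it back follows the standard TensorRT runtime pattern (a sketch with a hypothetical path, assuming the same TensorRT version that built the engine):

    import tensorrt as trt

    logger = trt.Logger(trt.Logger.INFO)
    runtime = trt.Runtime(logger)
    with open("output/tensorrt/model_00001_.engine", "rb") as f:  # hypothetical path
        engine = runtime.deserialize_cuda_engine(f.read())
    context = engine.create_execution_context()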