sheldonl committed on
Commit
5c48668
·
1 Parent(s): 15756c4

Fixed misc.to bug

Browse files
Files changed (2) hide show
  1. misc.py +41 -44
  2. world_generation_pipeline.py +6 -6
misc.py CHANGED
@@ -30,53 +30,50 @@ import termcolor
30
  import torch
31
 
32
  from .distributed import get_rank
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
 
35
  class misc():
36
-
37
- @staticmethod
38
- def to(
39
- data: Any,
40
- device: str | torch.device | None = None,
41
- dtype: torch.dtype | None = None,
42
- memory_format: torch.memory_format = torch.preserve_format,
43
- ) -> Any:
44
- """Recursively cast data into the specified device, dtype, and/or memory_format.
45
-
46
- The input data can be a tensor, a list of tensors, a dict of tensors.
47
- See the documentation for torch.Tensor.to() for details.
48
-
49
- Args:
50
- data (Any): Input data.
51
- device (str | torch.device): GPU device (default: None).
52
- dtype (torch.dtype): data type (default: None).
53
- memory_format (torch.memory_format): memory organization format (default: torch.preserve_format).
54
-
55
- Returns:
56
- data (Any): Data cast to the specified device, dtype, and/or memory_format.
57
- """
58
- assert (
59
- device is not None or dtype is not None or memory_format is not None
60
- ), "at least one of device, dtype, memory_format should be specified"
61
- if isinstance(data, torch.Tensor):
62
- is_cpu = (isinstance(device, str) and device == "cpu") or (
63
- isinstance(device, torch.device) and device.type == "cpu"
64
- )
65
- data = data.to(
66
- device=device,
67
- dtype=dtype,
68
- memory_format=memory_format,
69
- non_blocking=(not is_cpu),
70
- )
71
- return data
72
- elif isinstance(data, collections.abc.Mapping):
73
- return type(data)({key: to(data[key], device=device, dtype=dtype, memory_format=memory_format) for key in data})
74
- elif isinstance(data, collections.abc.Sequence) and not isinstance(data, (str, bytes)):
75
- return type(data)([to(elem, device=device, dtype=dtype, memory_format=memory_format) for elem in data])
76
- else:
77
- return data
78
-
79
-
80
  @staticmethod
81
  def serialize(data: Any) -> Any:
82
  """Serialize data by hierarchically traversing through iterables.
 
30
  import torch
31
 
32
  from .distributed import get_rank
33
+ def to(
34
+ data: Any,
35
+ device: str | torch.device | None = None,
36
+ dtype: torch.dtype | None = None,
37
+ memory_format: torch.memory_format = torch.preserve_format,
38
+ ) -> Any:
39
+ """Recursively cast data into the specified device, dtype, and/or memory_format.
40
+
41
+ The input data can be a tensor, a list of tensors, a dict of tensors.
42
+ See the documentation for torch.Tensor.to() for details.
43
+
44
+ Args:
45
+ data (Any): Input data.
46
+ device (str | torch.device): GPU device (default: None).
47
+ dtype (torch.dtype): data type (default: None).
48
+ memory_format (torch.memory_format): memory organization format (default: torch.preserve_format).
49
+
50
+ Returns:
51
+ data (Any): Data cast to the specified device, dtype, and/or memory_format.
52
+ """
53
+ assert (
54
+ device is not None or dtype is not None or memory_format is not None
55
+ ), "at least one of device, dtype, memory_format should be specified"
56
+ if isinstance(data, torch.Tensor):
57
+ is_cpu = (isinstance(device, str) and device == "cpu") or (
58
+ isinstance(device, torch.device) and device.type == "cpu"
59
+ )
60
+ data = data.to(
61
+ device=device,
62
+ dtype=dtype,
63
+ memory_format=memory_format,
64
+ non_blocking=(not is_cpu),
65
+ )
66
+ return data
67
+ elif isinstance(data, collections.abc.Mapping):
68
+ return type(data)({key: to(data[key], device=device, dtype=dtype, memory_format=memory_format) for key in data})
69
+ elif isinstance(data, collections.abc.Sequence) and not isinstance(data, (str, bytes)):
70
+ return type(data)([to(elem, device=device, dtype=dtype, memory_format=memory_format) for elem in data])
71
+ else:
72
+ return data
73
 
74
 
75
  class misc():
76
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  @staticmethod
78
  def serialize(data: Any) -> Any:
79
  """Serialize data by hierarchically traversing through iterables.
world_generation_pipeline.py CHANGED
@@ -40,7 +40,7 @@ from .inference_utils import (
40
  load_tokenizer_model,
41
  )
42
  from .log import log
43
- from .misc import misc
44
 
45
 
46
  def detect_model_size_from_ckpt_path(ckpt_path: str) -> str:
@@ -311,7 +311,7 @@ class ARBaseGenerationPipeline(BaseWorldGenerationPipeline):
311
  log.info(f"Using input size of {context_used} frames")
312
 
313
  data_batch = {"video": inp_vid}
314
- data_batch = misc.to(data_batch, "cuda")
315
 
316
  T, H, W = self.latent_shape
317
  num_gen_tokens = int(np.prod([T - latent_context_t_size, H, W]))
@@ -508,13 +508,13 @@ class ARBaseGenerationPipeline(BaseWorldGenerationPipeline):
508
  context = data_batch.get("context", None) if task_condition != "video" else None
509
  context_mask = data_batch.get("context_mask", None) if task_condition != "video" else None
510
  if context is not None:
511
- context = misc.to(context, "cuda").detach().clone()
512
  if context_mask is not None:
513
- context_mask = misc.to(context_mask, "cuda").detach().clone()
514
 
515
  # get the video tokens
516
  data_tokens, token_boundaries = self.model.tokenizer.tokenize(data_batch=data_batch)
517
- data_tokens = misc.to(data_tokens, "cuda").detach().clone()
518
  batch_size = data_tokens.shape[0]
519
 
520
  for sample_num in range(batch_size):
@@ -816,7 +816,7 @@ class ARVideo2WorldGenerationPipeline(ARBaseGenerationPipeline):
816
  data_batch["video"] = inp_vid
817
  data_batch["video"] = data_batch["video"].repeat(batch_size, 1, 1, 1, 1)
818
 
819
- data_batch = misc.to(data_batch, "cuda")
820
 
821
  log.debug(f" num_tokens_to_generate: {num_gen_tokens}")
822
  log.debug(f" sampling_config: {sampling_config}")
 
40
  load_tokenizer_model,
41
  )
42
  from .log import log
43
+ from .misc import misc, to
44
 
45
 
46
  def detect_model_size_from_ckpt_path(ckpt_path: str) -> str:
 
311
  log.info(f"Using input size of {context_used} frames")
312
 
313
  data_batch = {"video": inp_vid}
314
+ data_batch = to(data_batch, "cuda")
315
 
316
  T, H, W = self.latent_shape
317
  num_gen_tokens = int(np.prod([T - latent_context_t_size, H, W]))
 
508
  context = data_batch.get("context", None) if task_condition != "video" else None
509
  context_mask = data_batch.get("context_mask", None) if task_condition != "video" else None
510
  if context is not None:
511
+ context = to(context, "cuda").detach().clone()
512
  if context_mask is not None:
513
+ context_mask = to(context_mask, "cuda").detach().clone()
514
 
515
  # get the video tokens
516
  data_tokens, token_boundaries = self.model.tokenizer.tokenize(data_batch=data_batch)
517
+ data_tokens = to(data_tokens, "cuda").detach().clone()
518
  batch_size = data_tokens.shape[0]
519
 
520
  for sample_num in range(batch_size):
 
816
  data_batch["video"] = inp_vid
817
  data_batch["video"] = data_batch["video"].repeat(batch_size, 1, 1, 1, 1)
818
 
819
+ data_batch = to(data_batch, "cuda")
820
 
821
  log.debug(f" num_tokens_to_generate: {num_gen_tokens}")
822
  log.debug(f" sampling_config: {sampling_config}")