tchoudha21 committed on
Commit
5a33b3a
·
verified ·
1 Parent(s): 0660fbf

Upload modified files

Browse files
base_world_generation_pipeline.py ADDED
@@ -0,0 +1,362 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import gc
17
+ import os
18
+ from abc import ABC
19
+ from typing import Any
20
+
21
+ import numpy as np
22
+ import torch
23
+
24
+ from Cosmos.t5_text_encoder import CosmosT5TextEncoder
25
+ from Cosmos import guardrail_presets as guardrail_presets
26
+
27
+
28
+ class BaseWorldGenerationPipeline(ABC):
29
+ def __init__(
30
+ self,
31
+ inference_type: str | None = None,
32
+ checkpoint_dir: str | None = None,
33
+ checkpoint_name: str | None = None,
34
+ enable_text_guardrail: bool = False,
35
+ enable_video_guardrail: bool = False,
36
+ offload_network: bool = False,
37
+ offload_tokenizer: bool = False,
38
+ offload_text_encoder_model: bool = False,
39
+ offload_guardrail_models: bool = False,
40
+ ):
41
+ """Initialize base world generation pipeline.
42
+
43
+ This abstract base class provides core functionality for world generation models including:
44
+ - Model loading and initialization
45
+ - Text encoding and embedding
46
+ - Safety checks and content filtering
47
+ - Memory management through model offloading
48
+
49
+ Args:
50
+ inference_type: The type of inference pipeline ("text2world" or "video2world")
51
+ checkpoint_dir: Root directory containing model checkpoints
52
+ checkpoint_name: Name of the specific checkpoint file to load
53
+ enable_text_guardrail: If True, validates input prompts for safety
54
+ enable_video_guardrail: If True, validates generated videos for safety
55
+ offload_network: If True, moves main model to CPU after inference
56
+ offload_tokenizer: If True, moves tokenizer to CPU after use
57
+ offload_text_encoder_model: If True, moves T5 encoder to CPU after encoding
58
+ offload_guardrail_models: If True, moves safety models to CPU after checks
59
+ """
60
+ self.inference_type = inference_type
61
+ self.checkpoint_dir = checkpoint_dir
62
+ self.checkpoint_name = checkpoint_name
63
+ self.guardrail_dir = "Cosmos-1.0-Guardrail"
64
+ self.enable_text_guardrail = enable_text_guardrail
65
+ self.enable_video_guardrail = enable_video_guardrail
66
+
67
+ # Add offloading flags
68
+ self.offload_network = offload_network
69
+ self.offload_tokenizer = offload_tokenizer
70
+ self.offload_text_encoder_model = offload_text_encoder_model
71
+ self.offload_guardrail_models = offload_guardrail_models
72
+
73
+ # Initialize model instances
74
+ self.text_guardrail = None
75
+ self.video_guardrail = None
76
+ self.text_encoder = None
77
+ self.model = None
78
+
79
+ self._load_model()
80
+
81
+ if not self.offload_text_encoder_model:
82
+ self._load_text_encoder_model()
83
+ if not self.offload_guardrail_models:
84
+ if self.enable_text_guardrail:
85
+ self._load_text_guardrail()
86
+ if self.enable_video_guardrail:
87
+ self._load_video_guardrail()
88
+ if not self.offload_network:
89
+ self._load_network()
90
+ if not self.offload_tokenizer:
91
+ self._load_tokenizer()
92
+
93
+ def _load_tokenizer(self):
94
+ pass
95
+
96
+ def _load_network(self):
97
+ pass
98
+
99
+ def _load_model(self, checkpoint_name: str) -> Any:
100
+ """Load the world generation model from a checkpoint.
101
+
102
+ This abstract method must be implemented by subclasses to load their specific
103
+ model architecture and weights.
104
+
105
+ Args:
106
+ checkpoint_name: Path to the model checkpoint file
107
+
108
+ Returns:
109
+ The loaded model instance
110
+
111
+ Raises:
112
+ NotImplementedError: Must be implemented by subclasses
113
+ """
114
+ pass
115
+
116
+ def _load_text_encoder_model(self):
117
+ """Load the T5 text encoder model.
118
+
119
+ Initializes and loads the T5 encoder model used for converting text prompts
120
+ into embeddings that condition the world generation model.
121
+
122
+ Returns:
123
+ Loaded T5 text encoder model instance
124
+ """
125
+ self.text_encoder = CosmosT5TextEncoder(cache_dir=self.checkpoint_dir)
126
+
127
+ def _load_text_guardrail(self):
128
+ """Load text safety classifier models.
129
+
130
+ Initializes models used for checking input prompts against safety policies.
131
+ Models are loaded from the specified guardrail directory.
132
+ """
133
+ self.text_guardrail = guardrail_presets.create_text_guardrail_runner(
134
+ checkpoint_dir=os.path.join(self.checkpoint_dir, self.guardrail_dir)
135
+ )
136
+
137
+ def _load_video_guardrail(self):
138
+ """Load video safety classifier models.
139
+
140
+ Initializes models used for validating generated video content against
141
+ safety policies. Models are loaded from the specified guardrail directory.
142
+ """
143
+ self.video_guardrail = guardrail_presets.create_video_guardrail_runner(
144
+ checkpoint_dir=os.path.join(self.checkpoint_dir, self.guardrail_dir)
145
+ )
146
+
147
+ def _offload_network(self):
148
+ if self.model.model:
149
+ del self.model.model
150
+ self.model.model = None
151
+ gc.collect()
152
+ torch.cuda.empty_cache()
153
+
154
+ def _offload_tokenizer(self):
155
+ if self.model.tokenizer:
156
+ del self.model.tokenizer
157
+ self.model.tokenizer = None
158
+ gc.collect()
159
+ torch.cuda.empty_cache()
160
+
161
+ def _offload_guardrail_models(self):
162
+ """Offload safety classifier models to reduce memory usage.
163
+
164
+ Moves safety models to CPU and clears GPU memory if they are no longer needed.
165
+ This helps manage memory when processing multiple inputs sequentially.
166
+ """
167
+ if self.text_guardrail:
168
+ del self.text_guardrail
169
+ self.text_guardrail = None
170
+ if self.video_guardrail:
171
+ del self.video_guardrail
172
+ self.video_guardrail = None
173
+ gc.collect()
174
+ torch.cuda.empty_cache()
175
+
176
+ def _offload_text_encoder_model(self):
177
+ """Offload T5 text encoder to reduce memory usage.
178
+
179
+ Moves the T5 encoder to CPU and clears GPU memory after text encoding is complete.
180
+ This helps manage memory when processing multiple inputs sequentially.
181
+ """
182
+ if self.text_encoder:
183
+ del self.text_encoder
184
+ self.text_encoder = None
185
+ gc.collect()
186
+ torch.cuda.empty_cache()
187
+
188
+ def _run_model(self, *args: Any, **kwargs: Any) -> torch.Tensor:
189
+ """Generate world latents using the model.
190
+
191
+ This abstract method must be implemented by subclasses to define their specific
192
+ generation process.
193
+
194
+ Args:
195
+ *args: Variable positional arguments for model inference
196
+ **kwargs: Variable keyword arguments for model inference
197
+
198
+ Returns:
199
+ torch.Tensor: Generated world representation tensor
200
+ """
201
+ pass
202
+
203
+ def _run_model_with_offload(self, *args: Any, **kwargs: Any) -> torch.Tensor:
204
+ """Generate world representation with memory management.
205
+
206
+ Handles loading the model before inference and offloading afterward if enabled.
207
+ This helps minimize GPU memory usage during inference.
208
+
209
+ Args:
210
+ *args: Arguments passed to _run_model
211
+ **kwargs: Keyword arguments passed to _run_model
212
+
213
+ Returns:
214
+ np.ndarray: Generated world representation as numpy array
215
+ """
216
+ pass
217
+
218
+ def _run_guardrail_on_prompt(self, prompt: str) -> bool:
219
+ """Check if prompt meets safety requirements.
220
+
221
+ Validates the input prompt against safety policies using loaded guardrail models.
222
+
223
+ Args:
224
+ prompt: Raw text prompt to validate
225
+
226
+ Returns:
227
+ bool: True if prompt passes all safety checks, False otherwise
228
+ """
229
+ return guardrail_presets.run_text_guardrail(prompt, self.text_guardrail)
230
+
231
+ def _run_guardrail_on_prompt_with_offload(self, prompt: str) -> bool:
232
+ """Check prompt safety with memory management.
233
+
234
+ Validates prompt safety while handling model loading/offloading to manage memory.
235
+
236
+ Args:
237
+ prompt: Raw text prompt to validate
238
+
239
+ Returns:
240
+ bool: True if prompt passes all safety checks, False otherwise
241
+ """
242
+ if self.offload_guardrail_models:
243
+ self._load_text_guardrail()
244
+
245
+ is_safe = self._run_guardrail_on_prompt(prompt)
246
+
247
+ if self.offload_guardrail_models:
248
+ self._offload_guardrail_models()
249
+
250
+ return is_safe
251
+
252
+ def _run_guardrail_on_video(self, video: np.ndarray) -> np.ndarray | None:
253
+ """Check if video meets safety requirements.
254
+
255
+ Validates generated video content against safety policies using guardrail models.
256
+
257
+ Args:
258
+ video: Video frames to validate
259
+
260
+ Returns:
261
+ np.ndarray: Processed video if safe, None if unsafe
262
+ """
263
+ return guardrail_presets.run_video_guardrail(video, self.video_guardrail)
264
+
265
+ def _run_guardrail_on_video_with_offload(self, video: np.ndarray) -> np.ndarray | None:
266
+ """Check if generated video meets safety requirements.
267
+
268
+ Args:
269
+ video: Video frames to validate
270
+
271
+ Returns:
272
+ np.ndarray: Processed video frames if safe, None otherwise
273
+
274
+ Note:
275
+ Guardrail models are offloaded after checks if enabled.
276
+ """
277
+ if self.offload_guardrail_models:
278
+ self._load_video_guardrail()
279
+
280
+ video = self._run_guardrail_on_video(video)
281
+
282
+ if self.offload_guardrail_models:
283
+ self._offload_guardrail_models()
284
+ return video
285
+
286
+ def _run_text_embedding_on_prompt(
287
+ self, prompts: list[str], **kwargs: Any
288
+ ) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
289
+ """Convert text prompts to embeddings.
290
+
291
+ Processes text prompts into embedding tensors that condition the generation model.
292
+
293
+ Args:
294
+ prompts: List of text prompts to encode
295
+ **kwargs: Additional arguments for text encoding
296
+
297
+ Returns:
298
+ tuple containing:
299
+ - List of text embedding tensors for each prompt
300
+ - List of attention masks for each embedding
301
+ """
302
+
303
+ embeddings = []
304
+ masks = []
305
+ for prompt in prompts:
306
+ embedding, mask = self.text_encoder.encode_prompts(
307
+ [prompt],
308
+ **kwargs,
309
+ )
310
+ embeddings.append(embedding)
311
+ masks.append(mask)
312
+
313
+ return embeddings, masks
314
+
315
+ def _run_text_embedding_on_prompt_with_offload(
316
+ self, prompts: list[str], **kwargs: Any
317
+ ) -> tuple[list[torch.Tensor], list[torch.Tensor]]:
318
+ """Convert text prompt into embeddings using T5 encoder.
319
+
320
+ Args:
321
+ prompt: Processed and validated text prompt
322
+
323
+ Returns:
324
+ Text embedding tensor to condition diffusion model
325
+
326
+ Note:
327
+ T5 model is offloaded after encoding if enabled.
328
+ """
329
+ if self.offload_text_encoder_model:
330
+ self._load_text_encoder_model()
331
+
332
+ embeddings, masks = self._run_text_embedding_on_prompt(prompts, **kwargs)
333
+
334
+ if self.offload_text_encoder_model:
335
+ self._offload_text_encoder_model()
336
+ return embeddings, masks
337
+
338
+ def _run_tokenizer_decoding(self, samples: torch.Tensor) -> np.ndarray:
339
+ """Decode model outputs into final world representation.
340
+
341
+ This abstract method must be implemented by subclasses to convert raw model
342
+ outputs into their specific world representation format.
343
+
344
+ Args:
345
+ samples: Raw output tensor from the generation model
346
+
347
+ Returns:
348
+ np.ndarray: Decoded world representation
349
+ """
350
+ pass
351
+
352
+ def generate(self, *args: Any, **kwargs: Any):
353
+ """Generate world representation.
354
+
355
+ This abstract method must be implemented by subclasses to convert raw model
356
+ outputs into their specific world representation format.
357
+
358
+ Args:
359
+ *args: Variable positional arguments for model inference
360
+ **kwargs: Variable keyword arguments for model inference
361
+ """
362
+ pass
conditioner.py ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import copy
17
+ from abc import ABC, abstractmethod
18
+ from collections import defaultdict
19
+ from dataclasses import dataclass, fields
20
+ from enum import Enum
21
+ from typing import Any, Dict, List, Optional, Tuple, Union
22
+
23
+ import torch
24
+ import torch.nn as nn
25
+
26
+ from cosmos1.models.diffusion.diffusion.functional.batch_ops import batch_mul
27
+ from Cosmos.utils import log
28
+ from Cosmos.lazy_config import instantiate
29
+
30
+
31
class BaseConditionEntry(nn.Module):
    """Base class for a single conditioning input of the diffusion model.

    Holds per-entry configuration (dropout rate, batch input key, whether the
    entry returns a dict) and provides batch-wise random dropout of its input.
    """

    def __init__(self):
        super().__init__()
        self._dropout_rate = None
        self._input_key = None
        self._return_dict = False

    @property
    def dropout_rate(self) -> Union[float, torch.Tensor]:
        """Probability of dropping this entry's input per batch element."""
        return self._dropout_rate

    @dropout_rate.setter
    def dropout_rate(self, value: Union[float, torch.Tensor]):
        self._dropout_rate = value

    @dropout_rate.deleter
    def dropout_rate(self):
        del self._dropout_rate

    @property
    def input_key(self) -> str:
        """Key under which this entry's input is found in the data batch."""
        return self._input_key

    @input_key.setter
    def input_key(self, value: str):
        self._input_key = value

    @input_key.deleter
    def input_key(self):
        del self._input_key

    @property
    def is_return_dict(self) -> bool:
        """Whether forward() returns a dict rather than a bare tensor."""
        return self._return_dict

    @is_return_dict.setter
    def is_return_dict(self, value: bool):
        self._return_dict = value

    @is_return_dict.deleter
    def is_return_dict(self):
        del self._return_dict

    def random_dropout_input(
        self, in_tensor: torch.Tensor, dropout_rate: Optional[float] = None, key: Optional[str] = None
    ) -> torch.Tensor:
        """Zero out whole batch elements of `in_tensor` with the configured probability.

        Each element of the batch is independently kept with probability
        (1 - dropout_rate); dropped elements are multiplied by zero.
        """
        del key  # unused here; subclasses may dispatch on it
        rate = self.dropout_rate if dropout_rate is None else dropout_rate
        keep_mask = torch.bernoulli((1.0 - rate) * torch.ones(in_tensor.shape[0])).type_as(in_tensor)
        return batch_mul(keep_mask, in_tensor)

    def summary(self) -> str:
        """Human-readable description of the entry; subclasses may override."""
        pass
87
+
88
+
89
class DataType(Enum):
    """Modality of the conditioned data: still image or video."""

    IMAGE = "image"
    VIDEO = "video"
92
+
93
+
94
class TextAttr(BaseConditionEntry):
    """Text-conditioning entry that forwards T5 embeddings and masks unchanged."""

    def __init__(self):
        super().__init__()

    def forward(self, token: torch.Tensor, mask: torch.Tensor):
        # Expose the inputs under the keys the network's cross-attention expects.
        return {"crossattn_emb": token, "crossattn_mask": mask}

    def random_dropout_input(
        self, in_tensor: torch.Tensor, dropout_rate: Optional[float] = None, key: Optional[str] = None
    ) -> torch.Tensor:
        """Apply the base dropout policy, except attention masks are never dropped."""
        is_mask = key is not None and "mask" in key
        if is_mask:
            return in_tensor
        return super().random_dropout_input(in_tensor, dropout_rate, key)
107
+
108
+
109
@dataclass
class BaseVideoCondition:
    """Conditioning inputs consumed by the video diffusion network.

    Required cross-attention embeddings/masks plus optional per-clip metadata
    (fps, frame count, image size, padding mask, scalar feature).
    """

    crossattn_emb: torch.Tensor
    crossattn_mask: torch.Tensor
    data_type: DataType = DataType.VIDEO
    padding_mask: Optional[torch.Tensor] = None
    fps: Optional[torch.Tensor] = None
    num_frames: Optional[torch.Tensor] = None
    image_size: Optional[torch.Tensor] = None
    scalar_feature: Optional[torch.Tensor] = None

    def to_dict(self) -> Dict[str, Optional[torch.Tensor]]:
        """Return every dataclass field as a name -> value mapping."""
        return {field.name: getattr(self, field.name) for field in fields(self)}
122
+
123
+
124
@dataclass
class VideoExtendCondition(BaseVideoCondition):
    """Conditioning for video-extension (video2world) models.

    Extends BaseVideoCondition with the ground-truth latent of the conditioning
    frames and masks describing which region of the clip is conditioned.
    """

    video_cond_bool: Optional[torch.Tensor] = None  # whether or not it conditioned on video
    gt_latent: Optional[torch.Tensor] = None
    condition_video_indicator: Optional[torch.Tensor] = None  # 1 for condition region

    # condition_video_input_mask will concat to the input of network, along channel dim;
    # Will be concat with the input tensor
    condition_video_input_mask: Optional[torch.Tensor] = None
    # condition_video_augment_sigma: (B, T) tensor of sigma value for the conditional input augmentation, only valid when apply_corruption_to_condition_region is "noise_with_sigma" or "noise_with_sigma_fixed"
    condition_video_augment_sigma: Optional[torch.Tensor] = None
135
+
136
+
137
class GeneralConditioner(nn.Module, ABC):
    """
    Abstract container that manages a set of embedding models (embedders) with
    conditional and unconditional configurations. Each embedder can have its
    dropout rate adjusted dynamically to switch between conditioned and
    unconditioned behavior.

    Attributes:
        KEY2DIM (dict): Maps output keys to the dimension along which outputs
            sharing that key are concatenated.
        embedders (nn.ModuleDict): All embedders, instantiated from the
            provided configurations.

    Parameters:
        emb_models (Union[List, Any]): Keyword mapping of embedder name to its
            instantiation configuration.
    """

    KEY2DIM = {"crossattn_emb": 1, "crossattn_mask": 1}

    def __init__(self, **emb_models: Union[List, Any]):
        super().__init__()
        self.embedders = nn.ModuleDict()
        for idx, (emb_name, embconfig) in enumerate(emb_models.items()):
            embedder = instantiate(embconfig.obj)
            assert isinstance(
                embedder, BaseConditionEntry
            ), f"embedder model {embedder.__class__.__name__} has to inherit from AbstractEmbModel"
            embedder.dropout_rate = getattr(embconfig, "dropout_rate", 0.0)

            # Every embedder must declare where in the batch its input lives.
            if hasattr(embconfig, "input_key"):
                embedder.input_key = embconfig.input_key
            elif hasattr(embconfig, "input_keys"):
                embedder.input_keys = embconfig.input_keys
            else:
                raise KeyError(f"need either 'input_key' or 'input_keys' for embedder {embedder.__class__.__name__}")

            log.debug(f"Initialized embedder #{idx}-{emb_name}: \n {embedder.summary()}")
            self.embedders[emb_name] = embedder

    @abstractmethod
    def forward(
        self,
        batch: Dict,
        override_dropout_rate: Optional[Dict[str, float]] = None,
    ) -> Any:
        """Should be implemented in subclasses to handle conditon datatype"""
        raise NotImplementedError

    def _forward(
        self,
        batch: Dict,
        override_dropout_rate: Optional[Dict[str, float]] = None,
    ) -> Dict:
        """
        Run every embedder over `batch` (optionally overriding dropout rates)
        and concatenate the outputs per key along the dimensions in KEY2DIM.

        Parameters:
            batch (Dict): The input data batch to process.
            override_dropout_rate (Optional[Dict[str, float]]): Per-embedder
                dropout-rate overrides, keyed by embedder name.

        Returns:
            Dict: Output tensors concatenated by the configured dimensions.

        Note:
            If the network is sensitive to concatenation order, control the
            order via the config file or give each embedder output a unique key.
        """
        if override_dropout_rate is None:
            override_dropout_rate = {}
        # Reject overrides that name an unknown embedder early.
        for emb_name in override_dropout_rate:
            assert emb_name in self.embedders, f"invalid name found {emb_name}"

        collected = defaultdict(list)
        for emb_name, embedder in self.embedders.items():
            with torch.no_grad():
                rate = override_dropout_rate.get(emb_name, None)
                if getattr(embedder, "input_key", None) is not None:
                    emb_out = embedder(embedder.random_dropout_input(batch[embedder.input_key], rate))
                elif hasattr(embedder, "input_keys"):
                    dropped_inputs = [embedder.random_dropout_input(batch[k], rate, k) for k in embedder.input_keys]
                    emb_out = embedder(*dropped_inputs)
            for out_key, out_val in emb_out.items():
                collected[out_key].append(out_val)
        # Concatenate each key's outputs along its configured dimension (default: last).
        return {key: torch.cat(vals, dim=self.KEY2DIM.get(key, -1)) for key, vals in collected.items()}

    def get_condition_uncondition(
        self,
        data_batch: Dict,
    ) -> Tuple[Any, Any]:
        """
        Produce conditioned and unconditioned outputs for classifier-free guidance.

        Conditioned pass: all dropout rates forced to 0 so every embedder fully
        applies. Unconditioned pass: rates forced to 1 for embedders with a
        meaningful dropout rate (> 1e-4), 0 otherwise.

        Parameters:
            data_batch (Dict): Input batch with all information the embedders need.

        Returns:
            Tuple[Any, Any]: (conditioned output, unconditioned output).
        """
        cond_rates, uncond_rates = {}, {}
        for emb_name, embedder in self.embedders.items():
            cond_rates[emb_name] = 0.0
            uncond_rates[emb_name] = 1.0 if embedder.dropout_rate > 1e-4 else 0.0

        condition: Any = self(data_batch, override_dropout_rate=cond_rates)
        un_condition: Any = self(data_batch, override_dropout_rate=uncond_rates)
        return condition, un_condition

    def get_condition_with_negative_prompt(
        self,
        data_batch: Dict,
    ) -> Tuple[Any, Any]:
        """
        Same as get_condition_uncondition, except the unconditioned pass keeps
        text conditioning active and substitutes negative-prompt embeddings.
        """
        cond_rates, uncond_rates = {}, {}
        for emb_name, embedder in self.embedders.items():
            cond_rates[emb_name] = 0.0
            if isinstance(embedder, TextAttr):
                # Text is replaced by the negative prompt rather than dropped.
                uncond_rates[emb_name] = 0.0
            else:
                uncond_rates[emb_name] = 1.0 if embedder.dropout_rate > 1e-4 else 0.0

        neg_prompt_batch = copy.deepcopy(data_batch)
        if "neg_t5_text_embeddings" in neg_prompt_batch:
            if isinstance(neg_prompt_batch["neg_t5_text_embeddings"], torch.Tensor):
                neg_prompt_batch["t5_text_embeddings"] = neg_prompt_batch["neg_t5_text_embeddings"]
                neg_prompt_batch["t5_text_mask"] = neg_prompt_batch["neg_t5_text_mask"]

        condition: Any = self(data_batch, override_dropout_rate=cond_rates)
        un_condition: Any = self(neg_prompt_batch, override_dropout_rate=uncond_rates)

        return condition, un_condition
293
+
294
+
295
@dataclass
class CosmosCondition:
    """Minimal conditioning bundle: cross-attention embeddings/masks plus
    optional padding mask and scalar feature."""

    crossattn_emb: torch.Tensor
    crossattn_mask: torch.Tensor
    padding_mask: Optional[torch.Tensor] = None
    scalar_feature: Optional[torch.Tensor] = None

    def to_dict(self) -> Dict[str, Optional[torch.Tensor]]:
        """Return every dataclass field as a name -> value mapping."""
        return {field.name: getattr(self, field.name) for field in fields(self)}
304
+
305
+
306
class VideoConditioner(GeneralConditioner):
    """Conditioner whose embedder outputs form a BaseVideoCondition."""

    def forward(
        self,
        batch: Dict,
        override_dropout_rate: Optional[Dict[str, float]] = None,
    ) -> BaseVideoCondition:
        cond_kwargs = super()._forward(batch, override_dropout_rate)
        return BaseVideoCondition(**cond_kwargs)
314
+
315
+
316
class VideoExtendConditioner(GeneralConditioner):
    """Conditioner whose embedder outputs form a VideoExtendCondition."""

    def forward(
        self,
        batch: Dict,
        override_dropout_rate: Optional[Dict[str, float]] = None,
    ) -> VideoExtendCondition:
        cond_kwargs = super()._forward(batch, override_dropout_rate)
        return VideoExtendCondition(**cond_kwargs)
config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "DiffusionVideo2World"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "video2world_hf.DiffusionVideo2WorldConfig",
7
+ "AutoModel": "video2world_hf.DiffusionVideo2World"
8
+ },
9
+ "model_type": "AutoModel"
10
+ }
convert_pixtral_ckpt.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Convert pretrained Pixtral vision model weights to checkpoint and verify the checkpoint loading.
17
+
18
+ Usage:
19
+
20
+ PYTHONPATH=$(pwd) python cosmos1/scripts/convert_pixtral_ckpt.py
21
+ """
22
+
23
+ import argparse
24
+ import json
25
+ import os
26
+ import shutil
27
+ from glob import glob
28
+
29
+ import torch
30
+ from huggingface_hub import snapshot_download
31
+ from safetensors.torch import load_file
32
+
33
+
34
+ def convert_pixtral_checkpoint(checkpoint_dir: str, checkpoint_name: str, vit_type: str):
35
+ """
36
+ Main function to convert Pixtral vision model weights to checkpoint and optionally verify and save the converted checkpoint.
37
+
38
+ Args:
39
+ checkpoint_dir (str): Path to the checkpoint directory
40
+ checkpoint_name (str): Name of the checkpoint
41
+ vit_type (str): Type of ViT used in the Pixtral model
42
+
43
+ This function performs the following steps:
44
+ 0. Download the checkpoint from Hugging Face
45
+ 1. Loads the original Pixtral checkpoint
46
+ 2. Splits the checkpoint into vision encoder, projector, and LLM weights
47
+ 3. Reorganizes the weights to match the expected format
48
+ 4. Extracts and verifies the vision encoder configuration
49
+ 5. Optionally verifies the converted checkpoint by loading it into a VisionTransformer
50
+ 6. Optionally saves the converted checkpoint and configuration
51
+ """
52
+
53
+ save_dir = os.path.join(checkpoint_dir, checkpoint_name)
54
+ os.makedirs(save_dir, exist_ok=True)
55
+ # Save the converted checkpoint
56
+ save_path = os.path.join(save_dir, "model.pt")
57
+ if os.path.exists(save_path) and os.path.getsize(save_path) > 0:
58
+ print(f"Checkpoint {save_path} already exists and is not empty")
59
+ return
60
+
61
+ pixtral_ckpt_dir = os.path.join(checkpoint_dir, "Pixtral-12B-2409")
62
+ os.makedirs(pixtral_ckpt_dir, exist_ok=True)
63
+ repo_id = "mistralai/Pixtral-12B-2409"
64
+ print(f"Downloading {repo_id} to {pixtral_ckpt_dir}...")
65
+ snapshot_download(
66
+ repo_id=repo_id,
67
+ allow_patterns=["params.json", "consolidated.safetensors"],
68
+ local_dir=pixtral_ckpt_dir,
69
+ local_dir_use_symlinks=False,
70
+ )
71
+ orig_dtype = torch.get_default_dtype()
72
+ dtype = torch.bfloat16
73
+ torch.set_default_dtype(dtype)
74
+
75
+ # Load checkpoint file
76
+ ckpt_files = glob(os.path.join(pixtral_ckpt_dir, "*.safetensors"))
77
+ assert len(ckpt_files) == 1, "ckpt_dir should contain only one file"
78
+ ckpt_path = ckpt_files[0]
79
+ ckpt = load_file(ckpt_path)
80
+
81
+ # Split checkpoint into weights of vision encoder, projector, and LLM
82
+ vit_key_prefix = "vision_encoder."
83
+ vit_ckpt = {}
84
+ for key, value in ckpt.items():
85
+ if key.startswith(vit_key_prefix):
86
+ vit_ckpt[key.lstrip(vit_key_prefix)] = value
87
+
88
+ projector_key_prefix = "vision_language_adapter."
89
+ projector_ckpt = {}
90
+ substring_replacement_map = {
91
+ "w_in.": "projector.0.",
92
+ "w_out.": "projector.2.",
93
+ }
94
+ for key, value in ckpt.items():
95
+ if key.startswith(projector_key_prefix):
96
+ key = key.lstrip(projector_key_prefix)
97
+ for old, new in substring_replacement_map.items():
98
+ key = key.replace(old, new)
99
+ projector_ckpt[key] = value
100
+
101
+ llm_ckpt = {}
102
+ for key, value in ckpt.items():
103
+ if key.startswith(vit_key_prefix) or key.startswith(projector_key_prefix):
104
+ continue
105
+ llm_ckpt[key] = value
106
+
107
+ vlm_ckpt = {}
108
+ for key, value in llm_ckpt.items():
109
+ vlm_ckpt["model." + key] = value
110
+ for key, value in projector_ckpt.items():
111
+ vlm_ckpt["mm_projector." + key] = value
112
+ for key, value in vit_ckpt.items():
113
+ vlm_ckpt["vision_encoder." + key] = value
114
+
115
+ # Load config
116
+ config_path = os.path.join(pixtral_ckpt_dir, "params.json")
117
+ with open(config_path, "r") as f:
118
+ pixtral_config = json.load(f)
119
+
120
+ # Extract the vision encoder configuration
121
+ vision_encoder_config = {
122
+ "dim": pixtral_config["vision_encoder"]["hidden_size"],
123
+ "num_channels": pixtral_config["vision_encoder"]["num_channels"],
124
+ "image_size": pixtral_config["vision_encoder"]["image_size"],
125
+ "patch_size": pixtral_config["vision_encoder"]["patch_size"],
126
+ "rope_theta": pixtral_config["vision_encoder"]["rope_theta"],
127
+ "ffn_hidden_size": pixtral_config["vision_encoder"]["intermediate_size"],
128
+ "n_layers": pixtral_config["vision_encoder"]["num_hidden_layers"],
129
+ "n_heads": pixtral_config["vision_encoder"]["num_attention_heads"],
130
+ "n_kv_heads": pixtral_config["vision_encoder"]["num_attention_heads"],
131
+ "norm_type": "rmsnorm",
132
+ "norm_eps": pixtral_config["norm_eps"],
133
+ "image_token_id": pixtral_config["vision_encoder"]["image_token_id"],
134
+ }
135
+ # Configuration for the 400M ViT of Pixtral 12B VLM
136
+ vit_config = dict(
137
+ dim=1024,
138
+ num_channels=3,
139
+ image_size=1024,
140
+ patch_size=16,
141
+ rope_theta=10000,
142
+ ffn_hidden_size=4096,
143
+ n_layers=24,
144
+ n_heads=16,
145
+ n_kv_heads=16,
146
+ norm_type="rmsnorm",
147
+ norm_eps=1e-5,
148
+ image_token_id=10,
149
+ )
150
+ # Compare the two configurations
151
+ for key, value in vit_config.items():
152
+ assert vision_encoder_config[key] == value, f"Mismatch in {key}: {vision_encoder_config[key]} != {value}"
153
+
154
+ llm_config_keys = [
155
+ "dim",
156
+ "n_layers",
157
+ "head_dim",
158
+ "hidden_dim",
159
+ "n_heads",
160
+ "n_kv_heads",
161
+ "rope_theta",
162
+ "norm_eps",
163
+ "vocab_size",
164
+ ]
165
+ assert set(list(pixtral_config.keys())) == set(llm_config_keys + ["vision_encoder"]), "Config keys mismatch"
166
+ replace_map = {
167
+ "hidden_dim": "ffn_hidden_size",
168
+ }
169
+ llm_config = {}
170
+ for k, v in pixtral_config.items():
171
+ if k in llm_config_keys:
172
+ llm_config[replace_map.get(k, k)] = v
173
+ elif k == "vision_encoder":
174
+ llm_config["vision_encoder"] = vit_type
175
+ else:
176
+ raise ValueError(f"Unknown key: {k}")
177
+
178
+ ckpt_to_save = {"model": vlm_ckpt, "mm_projector": projector_ckpt, "vision_encoder": vit_ckpt}
179
+ torch.save(ckpt_to_save, save_path)
180
+ print(f"Model saved to {save_path}")
181
+
182
+ # Save config
183
+ config_path = os.path.join(save_dir, "config.json")
184
+ with open(config_path, "w") as f:
185
+ json.dump(llm_config, f)
186
+
187
+ torch.set_default_dtype(orig_dtype) # Reset the default dtype
188
+
189
+ # Remove the original Pixtral checkpoint
190
+ shutil.rmtree(pixtral_ckpt_dir, ignore_errors=True)
191
+ print(f"Removed {pixtral_ckpt_dir}")
192
+
193
+
194
if __name__ == "__main__":
    # CLI entry point: download the Pixtral checkpoint and convert it in place.
    parser = argparse.ArgumentParser(
        description="Convert pretrained Pixtral vision model weights to checkpoint and verify accuracy"
    )
    parser.add_argument("--checkpoint_dir", type=str, default="checkpoints", help="Path to the checkpoint directory")
    parser.add_argument(
        "--checkpoint_name",
        type=str,
        default="Pixtral-12B",
        help="Name of the checkpoint",
    )
    parser.add_argument("--vit_type", default="pixtral-12b-vit", help="Type of ViT used in the Pixtral model")
    cli_args = parser.parse_args()
    convert_pixtral_checkpoint(
        checkpoint_dir=cli_args.checkpoint_dir,
        checkpoint_name=cli_args.checkpoint_name,
        vit_type=cli_args.vit_type,
    )
download_diffusion.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import argparse
17
+ from pathlib import Path
18
+
19
+ from huggingface_hub import snapshot_download
20
+
21
+ from Cosmos.convert_pixtral_ckpt import convert_pixtral_checkpoint
22
+
23
+
24
def parse_args():
    """Parse command line options for the Cosmos-1.0 Diffusion downloader.

    Returns:
        argparse.Namespace: Parsed arguments with fields ``model_sizes``,
        ``model_types``, ``cosmos_version``, and ``checkpoint_dir``.
    """
    parser = argparse.ArgumentParser(description="Download NVIDIA Cosmos-1.0 Diffusion models from Hugging Face")
    parser.add_argument(
        "--model_sizes",
        nargs="*",
        default=["7B", "14B"],  # Download all by default
        choices=["7B", "14B"],
        help="Which model sizes to download. Possible values: 7B, 14B",
    )
    parser.add_argument(
        "--model_types",
        nargs="*",
        default=["Text2World", "Video2World"],  # Download all by default
        choices=["Text2World", "Video2World"],
        help="Which model types to download. Possible values: Text2World, Video2World",
    )
    parser.add_argument(
        "--cosmos_version",
        type=str,
        default="1.0",
        choices=["1.0"],
        help="Which version of Cosmos to download. Only 1.0 is available at the moment.",
    )
    parser.add_argument(
        "--checkpoint_dir", type=str, default="checkpoints", help="Directory to save the downloaded checkpoints."
    )
    return parser.parse_args()
58
+
59
+
60
def main(args):
    """Download the requested Cosmos-1.0 Diffusion checkpoints from Hugging Face.

    Downloads the selected Diffusion models (per ``--model_sizes`` /
    ``--model_types``), plus the guardrail and tokenizer models that are always
    required, and converts the Pixtral checkpoint when Video2World models are
    requested.

    Args:
        args: Parsed command line arguments from ``parse_args()``.
    """
    ORG_NAME = "nvidia"

    # Map the size flag to its Hugging Face repository base name.
    model_map = {
        "7B": "Cosmos-1.0-Diffusion-7B",
        "14B": "Cosmos-1.0-Diffusion-14B",
    }

    # Models downloaded regardless of the requested sizes/types.
    extra_models = [
        "Cosmos-1.0-Guardrail",
        "Cosmos-1.0-Tokenizer-CV8x8x8",
    ]
    if "Text2World" in args.model_types:
        extra_models.append("Cosmos-1.0-Prompt-Upsampler-12B-Text2World")

    # Create local checkpoints folder
    checkpoints_dir = Path(args.checkpoint_dir)
    checkpoints_dir.mkdir(parents=True, exist_ok=True)

    # Diffusion repos only need this subset of files.
    download_kwargs = dict(allow_patterns=["README.md", "model.pt", "config.json", "*.jit"])

    # Download the requested Diffusion models.
    for size in args.model_sizes:
        for model_type in args.model_types:
            model_name = f"{model_map[size]}-{model_type}"
            repo_id = f"{ORG_NAME}/{model_name}"
            local_dir = checkpoints_dir.joinpath(model_name)
            local_dir.mkdir(parents=True, exist_ok=True)

            print(f"Downloading {repo_id} to {local_dir}...")
            snapshot_download(
                repo_id=repo_id, local_dir=str(local_dir), local_dir_use_symlinks=False, **download_kwargs
            )

    # Download the always-included models (all files — no allow_patterns filter,
    # since e.g. Guardrail needs its full repository contents).
    for model_name in extra_models:
        repo_id = f"{ORG_NAME}/{model_name}"
        local_dir = checkpoints_dir.joinpath(model_name)
        local_dir.mkdir(parents=True, exist_ok=True)

        print(f"Downloading {repo_id} to {local_dir}...")
        snapshot_download(
            repo_id=repo_id,
            local_dir=str(local_dir),
            local_dir_use_symlinks=False,
        )

    if "Video2World" in args.model_types:
        # Prompt Upsampler for Cosmos-1.0-Diffusion-Video2World models:
        # download + convert the Pixtral checkpoint under checkpoint_dir.
        convert_pixtral_checkpoint(
            checkpoint_dir=args.checkpoint_dir,
            checkpoint_name="Pixtral-12B",
            vit_type="pixtral-12b-vit",
        )
119
+
120
+
121
if __name__ == "__main__":
    # Script entry point: parse CLI options, then download the checkpoints.
    main(parse_args())
guardrail_presets.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import os
17
+
18
+ import numpy as np
19
+
20
+ from cosmos1.models.guardrail.aegis.aegis import Aegis
21
+ from cosmos1.models.guardrail.blocklist.blocklist import Blocklist
22
+ from cosmos1.models.guardrail.common.core import GuardrailRunner
23
+ from cosmos1.models.guardrail.face_blur_filter.face_blur_filter import RetinaFaceFilter
24
+ from cosmos1.models.guardrail.video_content_safety_filter.video_content_safety_filter import VideoContentSafetyFilter
25
+ from Cosmos.utils import log
26
+
27
+
28
def create_text_guardrail_runner(checkpoint_dir: str) -> GuardrailRunner:
    """Build the guardrail runner used to screen text prompts.

    Combines the blocklist and Aegis safety models, whose weights live under
    ``<checkpoint_dir>/blocklist`` and ``<checkpoint_dir>/aegis`` respectively.
    """
    blocklist = Blocklist(os.path.join(checkpoint_dir, "blocklist"))
    aegis = Aegis(os.path.join(checkpoint_dir, "aegis"))
    return GuardrailRunner(safety_models=[blocklist, aegis])
33
+
34
+
35
def create_video_guardrail_runner(checkpoint_dir: str) -> GuardrailRunner:
    """Build the guardrail runner used to screen generated video frames.

    Pairs a video content-safety filter (safety model) with a RetinaFace-based
    face blur filter (postprocessor).
    """
    content_filter = VideoContentSafetyFilter(os.path.join(checkpoint_dir, "video_content_safety_filter"))
    face_blur = RetinaFaceFilter(os.path.join(checkpoint_dir, "face_blur_filter/Resnet50_Final.pth"))
    return GuardrailRunner(
        safety_models=[content_filter],
        postprocessors=[face_blur],
    )
43
+
44
+
45
def run_text_guardrail(prompt: str, guardrail_runner: "GuardrailRunner") -> bool:
    """Run the text guardrail on the prompt, checking for content safety.

    Args:
        prompt: The text prompt.
        guardrail_runner: The text guardrail runner.

    Returns:
        bool: Whether the prompt is safe.
    """
    safe, reason = guardrail_runner.run_safety_check(prompt)
    if not safe:
        log.critical(f"GUARDRAIL BLOCKED: {reason}")
    return safe
59
+
60
+
61
def run_video_guardrail(frames: "np.ndarray", guardrail_runner: "GuardrailRunner") -> "np.ndarray | None":
    """Run the video guardrail on the frames, checking for content safety and applying face blur.

    Args:
        frames: The frames of the generated video.
        guardrail_runner: The video guardrail runner.

    Returns:
        The processed frames if safe, otherwise None.
    """
    safe, reason = guardrail_runner.run_safety_check(frames)
    if not safe:
        log.critical(f"GUARDRAIL BLOCKED: {reason}")
        return None
    # Safe: apply postprocessors (e.g. face blur) and hand back the frames.
    return guardrail_runner.postprocess(frames)
inference_utils.py ADDED
@@ -0,0 +1,726 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import argparse
17
+ import importlib
18
+ from contextlib import contextmanager
19
+ from typing import List, NamedTuple, Optional, Tuple
20
+
21
+ from Cosmos.utils import misc
22
+ import einops
23
+ import imageio
24
+ import numpy as np
25
+ import torch
26
+ import torchvision.transforms.functional as transforms_F
27
+
28
+ from Cosmos.model_t2w import DiffusionT2WModel
29
+ from Cosmos.model_v2w import DiffusionV2WModel
30
+ from Cosmos.utils import log
31
+ from Cosmos.utils.config_helper import get_config_module, override
32
+ from Cosmos.utils.io import load_from_fileobj
33
+
34
+ TORCH_VERSION: Tuple[int, ...] = tuple(int(x) for x in torch.__version__.split(".")[:2])
35
+ if TORCH_VERSION >= (1, 11):
36
+ from torch.ao import quantization
37
+ from torch.ao.quantization import FakeQuantizeBase, ObserverBase
38
+ elif (
39
+ TORCH_VERSION >= (1, 8)
40
+ and hasattr(torch.quantization, "FakeQuantizeBase")
41
+ and hasattr(torch.quantization, "ObserverBase")
42
+ ):
43
+ from torch import quantization
44
+ from torch.quantization import FakeQuantizeBase, ObserverBase
45
+
46
+ DEFAULT_AUGMENT_SIGMA = 0.001
47
+
48
+
49
def add_common_arguments(parser):
    """Add common command line arguments for text2world and video2world generation.

    Args:
        parser (ArgumentParser): Argument parser to add arguments to

    The arguments include:
    - checkpoint_dir: Base directory containing model weights
    - tokenizer_dir: Directory containing tokenizer weights
    - video_save_name: Output video filename for single video generation
    - video_save_folder: Output directory for batch video generation
    - prompt: Text prompt for single video generation
    - batch_input_path: Path to JSONL file with input prompts for batch video generation
    - negative_prompt: Text prompt describing undesired attributes
    - num_steps: Number of diffusion sampling steps
    - guidance: Classifier-free guidance scale
    - num_video_frames: Number of frames to generate
    - height/width: Output video dimensions
    - fps: Output video frame rate
    - seed: Random seed for reproducibility
    - Various model offloading flags
    """
    default_negative_prompt = (
        "The video captures a series of frames showing ugly scenes, static with no motion, motion blur, "
        "over-saturation, shaky footage, low resolution, grainy texture, pixelated images, poorly lit areas, "
        "underexposed and overexposed scenes, poor color balance, washed out colors, choppy sequences, "
        "jerky movements, low frame rate, artifacting, color banding, unnatural transitions, outdated special "
        "effects, fake elements, unconvincing visuals, poorly edited content, jump cuts, visual noise, and "
        "flickering. Overall, the video is of poor quality."
    )
    # (flag, keyword arguments) for every shared option, in display order.
    option_specs = [
        ("--checkpoint_dir", dict(type=str, default="checkpoints", help="Base directory containing model checkpoints")),
        (
            "--tokenizer_dir",
            dict(
                type=str,
                default="Cosmos-1.0-Tokenizer-CV8x8x8",
                help="Tokenizer weights directory relative to checkpoint_dir",
            ),
        ),
        ("--video_save_name", dict(type=str, default="output", help="Output filename for generating a single video")),
        (
            "--video_save_folder",
            dict(type=str, default="outputs/", help="Output folder for generating a batch of videos"),
        ),
        ("--prompt", dict(type=str, help="Text prompt for generating a single video")),
        (
            "--batch_input_path",
            dict(type=str, help="Path to a JSONL file of input prompts for generating a batch of videos"),
        ),
        ("--negative_prompt", dict(type=str, default=default_negative_prompt, help="Negative prompt for the video")),
        ("--num_steps", dict(type=int, default=35, help="Number of diffusion sampling steps")),
        ("--guidance", dict(type=float, default=7, help="Guidance scale value")),
        ("--num_video_frames", dict(type=int, default=121, help="Number of video frames to sample")),
        ("--height", dict(type=int, default=704, help="Height of video to sample")),
        ("--width", dict(type=int, default=1280, help="Width of video to sample")),
        ("--fps", dict(type=int, default=24, help="FPS of the sampled video")),
        ("--seed", dict(type=int, default=1, help="Random seed")),
        ("--disable_prompt_upsampler", dict(action="store_true", help="Disable prompt upsampling")),
        ("--offload_diffusion_transformer", dict(action="store_true", help="Offload DiT after inference")),
        ("--offload_tokenizer", dict(action="store_true", help="Offload tokenizer after inference")),
        ("--offload_text_encoder_model", dict(action="store_true", help="Offload text encoder model after inference")),
        ("--offload_prompt_upsampler", dict(action="store_true", help="Offload prompt upsampler after inference")),
        ("--offload_guardrail_models", dict(action="store_true", help="Offload guardrail models after inference")),
    ]
    for flag, kwargs in option_specs:
        parser.add_argument(flag, **kwargs)
150
+
151
+
152
def validate_args(args: argparse.Namespace, inference_type: str) -> None:
    """Validate command line arguments for text2world and video2world generation."""
    assert inference_type in (
        "text2world",
        "video2world",
    ), "Invalid inference_type, must be 'text2world' or 'video2world'"

    # A prompt source is mandatory unless video2world will upsample the prompt.
    prompt_required = inference_type == "text2world" or (
        inference_type == "video2world" and args.disable_prompt_upsampler
    )
    if prompt_required:
        assert args.prompt or args.batch_input_path, "--prompt or --batch_input_path must be provided."

    # Single-video video2world generation additionally needs a conditioning input.
    if inference_type == "video2world" and not args.batch_input_path:
        assert (
            args.input_image_or_video_path
        ), "--input_image_or_video_path must be provided for single video generation."
166
+
167
+
168
+ class _IncompatibleKeys(
169
+ NamedTuple(
170
+ "IncompatibleKeys",
171
+ [
172
+ ("missing_keys", List[str]),
173
+ ("unexpected_keys", List[str]),
174
+ ("incorrect_shapes", List[Tuple[str, Tuple[int], Tuple[int]]]),
175
+ ],
176
+ )
177
+ ):
178
+ pass
179
+
180
+
181
def non_strict_load_model(model: torch.nn.Module, checkpoint_state_dict: dict) -> _IncompatibleKeys:
    """Load a model checkpoint with non-strict matching, handling shape mismatches.

    Args:
        model (torch.nn.Module): Model to load weights into
        checkpoint_state_dict (dict): State dict from checkpoint

    Returns:
        _IncompatibleKeys: Named tuple containing:
            - missing_keys: Keys present in model but missing from checkpoint
            - unexpected_keys: Keys present in checkpoint but not in model
            - incorrect_shapes: Keys with mismatched tensor shapes

    The function handles special cases like:
    - Uninitialized parameters
    - Quantization observers
    - TransformerEngine FP8 states

    Note:
        `checkpoint_state_dict` is mutated in place: entries with mismatched
        shapes are popped before delegating to `model.load_state_dict`.
    """
    # workaround https://github.com/pytorch/pytorch/issues/24139
    model_state_dict = model.state_dict()
    incorrect_shapes = []
    # First pass: drop checkpoint entries whose shapes cannot be loaded into
    # the model. Keys absent from the model are left for strict=False below.
    for k in list(checkpoint_state_dict.keys()):
        if k in model_state_dict:
            if "_extra_state" in k:  # Key introduced by TransformerEngine for FP8
                log.debug(f"Skipping key {k} introduced by TransformerEngine for FP8 in the checkpoint.")
                continue
            model_param = model_state_dict[k]
            # Allow mismatch for uninitialized parameters
            if TORCH_VERSION >= (1, 8) and isinstance(model_param, torch.nn.parameter.UninitializedParameter):
                continue
            if not isinstance(model_param, torch.Tensor):
                raise ValueError(
                    f"Find non-tensor parameter {k} in the model. type: {type(model_param)} {type(checkpoint_state_dict[k])}, please check if this key is safe to skip or not."
                )

            shape_model = tuple(model_param.shape)
            shape_checkpoint = tuple(checkpoint_state_dict[k].shape)
            if shape_model != shape_checkpoint:
                # Observer/FakeQuantize classes only exist on torch >= 1.8; on
                # older torch the quantization special-casing is skipped entirely.
                has_observer_base_classes = (
                    TORCH_VERSION >= (1, 8)
                    and hasattr(quantization, "ObserverBase")
                    and hasattr(quantization, "FakeQuantizeBase")
                )
                if has_observer_base_classes:
                    # Handle the special case of quantization per channel observers,
                    # where buffer shape mismatches are expected.
                    def _get_module_for_key(model: torch.nn.Module, key: str) -> torch.nn.Module:
                        # foo.bar.param_or_buffer_name -> [foo, bar]
                        key_parts = key.split(".")[:-1]
                        cur_module = model
                        for key_part in key_parts:
                            cur_module = getattr(cur_module, key_part)
                        return cur_module

                    cls_to_skip = (
                        ObserverBase,
                        FakeQuantizeBase,
                    )
                    target_module = _get_module_for_key(model, k)
                    if isinstance(target_module, cls_to_skip):
                        # Do not remove modules with expected shape mismatches
                        # them from the state_dict loading. They have special logic
                        # in _load_from_state_dict to handle the mismatches.
                        continue

                # Record the mismatch and drop the entry so load_state_dict
                # does not raise on it.
                incorrect_shapes.append((k, shape_checkpoint, shape_model))
                checkpoint_state_dict.pop(k)
    # Second pass: PyTorch itself collects missing/unexpected keys (strict=False).
    incompatible = model.load_state_dict(checkpoint_state_dict, strict=False)
    # Remove keys with "_extra_state" suffix, which are non-parameter items introduced by TransformerEngine for FP8 handling
    missing_keys = [k for k in incompatible.missing_keys if "_extra_state" not in k]
    unexpected_keys = [k for k in incompatible.unexpected_keys if "_extra_state" not in k]
    return _IncompatibleKeys(
        missing_keys=missing_keys,
        unexpected_keys=unexpected_keys,
        incorrect_shapes=incorrect_shapes,
    )
257
+
258
+
259
@contextmanager
def skip_init_linear():
    """Context manager that temporarily disables nn.Linear weight initialization.

    Patches ``torch.nn.Linear.reset_parameters`` and
    ``torch.nn.init.xavier_uniform_`` with no-ops so large models can be
    constructed quickly before their weights are loaded from a checkpoint.

    Fix over the previous version: restoration now happens in a ``finally``
    block, so an exception raised while the manager is active no longer leaves
    torch globally monkey-patched (which would silently break every later
    model construction in the process).
    """
    orig_reset_parameters = torch.nn.Linear.reset_parameters
    orig_xavier_uniform_ = torch.nn.init.xavier_uniform_
    torch.nn.Linear.reset_parameters = lambda x: x
    torch.nn.init.xavier_uniform_ = lambda x: x
    try:
        yield
    finally:
        # Always restore the original initializers, even if the body raised.
        torch.nn.Linear.reset_parameters = orig_reset_parameters
        torch.nn.init.xavier_uniform_ = orig_xavier_uniform_
269
+
270
+
271
def load_model_by_config(
    config_job_name,
    config_file="projects/cosmos_video/config/config.py",
    model_class=DiffusionT2WModel,
):
    """Instantiate a (weightless) diffusion model from a named experiment config.

    Args:
        config_job_name: Experiment name used to select the config override.
        config_file: Path to the config module to load.
        model_class: Model class to construct from ``config.model``.

    Returns:
        A ``model_class`` instance built from the validated, frozen config.
    """
    config_module = get_config_module(config_file)
    config = importlib.import_module(config_module).make_config()
    config = override(config, ["--", f"experiment={config_job_name}"])

    # Validate, then freeze so the config cannot be mutated afterwards.
    config.validate()
    config.freeze()  # type: ignore

    # Linear weights are replaced by a checkpoint later, so skip their
    # (slow) default initialization during construction.
    with skip_init_linear():
        return model_class(config.model)
290
+
291
+
292
def load_network_model(model: DiffusionT2WModel, ckpt_path: str):
    """Load diffusion network weights from `ckpt_path` into `model` and move it to GPU.

    Set-up runs under skip_init_linear() since the freshly-built linear layers
    are immediately overwritten by the checkpoint.
    """
    with skip_init_linear():
        model.set_up_model()
    state_dict = torch.load(ckpt_path, map_location="cpu", weights_only=True)
    # Non-strict load: mismatches are logged rather than fatal.
    log.debug(non_strict_load_model(model.model, state_dict))
    model.cuda()
298
+
299
+
300
def load_tokenizer_model(model: DiffusionT2WModel, tokenizer_dir: str):
    """Instantiate the video tokenizer from `tokenizer_dir` and move `model` to GPU.

    Linear initialization is skipped because the tokenizer weights come from
    the checkpoint directory.
    """
    with skip_init_linear():
        model.set_up_tokenizer(tokenizer_dir)
    model.cuda()
304
+
305
+
306
def prepare_data_batch(
    height: int,
    width: int,
    num_frames: int,
    fps: int,
    prompt_embedding: torch.Tensor,
    negative_prompt_embedding: Optional[torch.Tensor] = None,
):
    """Prepare input batch tensors for video generation.

    Args:
        height (int): Height of video frames
        width (int): Width of video frames
        num_frames (int): Number of frames to generate
        fps (int): Frames per second
        prompt_embedding (torch.Tensor): Encoded text prompt embeddings
        negative_prompt_embedding (torch.Tensor, optional): Encoded negative prompt embeddings

    Returns:
        dict: Batch dictionary containing:
            - video: Zero tensor of target video shape
            - t5_text_mask: Attention mask for text embeddings
            - image_size: Target frame dimensions
            - fps: Target frame rate
            - num_frames: Number of frames
            - padding_mask: Frame padding mask
            - t5_text_embeddings: Prompt embeddings
            - neg_t5_text_embeddings: Negative prompt embeddings (if provided)
            - neg_t5_text_mask: Mask for negative embeddings (if provided)
    """
    bf16 = torch.bfloat16
    # Base batch: the "video" entry is a placeholder of the target shape; the
    # actual content is produced by sampling.
    batch = {
        "video": torch.zeros((1, 3, num_frames, height, width), dtype=torch.uint8).cuda(),
        "t5_text_mask": torch.ones(1, 512, dtype=bf16).cuda(),
        "image_size": torch.tensor([[height, width, height, width]], dtype=bf16).cuda(),
        "fps": torch.tensor([fps], dtype=bf16).cuda(),
        "num_frames": torch.tensor([num_frames], dtype=bf16).cuda(),
        "padding_mask": torch.zeros((1, 1, height, width), dtype=bf16).cuda(),
    }

    # Positive (and optionally negative) text embeddings, cast to bfloat16.
    batch["t5_text_embeddings"] = prompt_embedding.to(dtype=bf16).cuda()
    if negative_prompt_embedding is not None:
        batch["neg_t5_text_embeddings"] = negative_prompt_embedding.to(dtype=bf16).cuda()
        batch["neg_t5_text_mask"] = torch.ones(1, 512, dtype=bf16).cuda()

    return batch
357
+
358
+
359
def get_video_batch(model, prompt_embedding, negative_prompt_embedding, height, width, fps, num_video_frames):
    """Prepare complete input batch for video generation including latent dimensions.

    Args:
        model: Diffusion model instance
        prompt_embedding (torch.Tensor): Text prompt embeddings
        negative_prompt_embedding (torch.Tensor): Negative prompt embeddings
        height (int): Output video height
        width (int): Output video width
        fps (int): Output video frame rate
        num_video_frames (int): Number of frames to generate

    Returns:
        tuple:
            - data_batch (dict): Complete model input batch
            - state_shape (list): Shape of latent state [C,T,H,W] accounting for VAE compression
    """
    data_batch = prepare_data_batch(
        height=height,
        width=width,
        num_frames=num_video_frames,
        fps=fps,
        prompt_embedding=prompt_embedding,
        negative_prompt_embedding=negative_prompt_embedding,
    )
    # Latent shape reflects the tokenizer's temporal and spatial compression.
    tokenizer = model.tokenizer
    state_shape = [
        tokenizer.channel,
        tokenizer.get_latent_num_frames(num_video_frames),
        height // tokenizer.spatial_compression_factor,
        width // tokenizer.spatial_compression_factor,
    ]
    return data_batch, state_shape
391
+
392
+
393
def generate_world_from_text(
    model: DiffusionT2WModel,
    state_shape: list[int],
    is_negative_prompt: bool,
    data_batch: dict,
    guidance: float,
    num_steps: int,
    seed: int,
):
    """Generate video from text prompt using diffusion model.

    Args:
        model (DiffusionT2WModel): Text-to-video diffusion model
        state_shape (list[int]): Latent state dimensions [C,T,H,W]
        is_negative_prompt (bool): Whether negative prompt is provided
        data_batch (dict): Model input batch with embeddings
        guidance (float): Classifier-free guidance scale
        num_steps (int): Number of diffusion sampling steps
        seed (int): Random seed for reproducibility

    Returns:
        np.ndarray: Generated video frames [T,H,W,C], range [0,255]
    """
    # Start from pure noise at the SDE's maximum noise level.
    init_noise = misc.arch_invariant_rand(
        (1,) + tuple(state_shape),
        torch.float32,
        model.tensor_kwargs["device"],
        seed,
    )
    x_sigma_max = init_noise * model.sde.sigma_max

    # Guided diffusion sampling; decoding to pixel space happens inside the model.
    return model.generate_samples_from_batch(
        data_batch,
        guidance=guidance,
        state_shape=state_shape,
        num_steps=num_steps,
        is_negative_prompt=is_negative_prompt,
        seed=seed,
        x_sigma_max=x_sigma_max,
    )
443
+
444
+
445
def generate_world_from_video(
    model: DiffusionV2WModel,
    state_shape: list[int],
    is_negative_prompt: bool,
    data_batch: dict,
    guidance: float,
    num_steps: int,
    seed: int,
    condition_latent: torch.Tensor,
    num_input_frames: int,
) -> Tuple[np.array, list, list]:
    """Generate video using a conditioning video/image input.

    Args:
        model (DiffusionV2WModel): The diffusion model instance
        state_shape (list[int]): Shape of the latent state [C,T,H,W]
        is_negative_prompt (bool): Whether negative prompt is provided
        data_batch (dict): Batch containing model inputs including text embeddings
        guidance (float): Classifier-free guidance scale for sampling
        num_steps (int): Number of diffusion sampling steps
        seed (int): Random seed for generation
        condition_latent (torch.Tensor): Latent tensor from conditioning video/image file
        num_input_frames (int): Number of input frames

    Returns:
        np.array: Generated video frames in shape [T,H,W,C], range [0,255]
    """
    assert not model.config.conditioner.video_cond_bool.sample_tokens_start_from_p_or_i, "not supported"
    augment_sigma = DEFAULT_AUGMENT_SIGMA

    # Zero-pad the condition latent along time so it matches the latent state.
    target_num_latent_frames = state_shape[1]
    if condition_latent.shape[2] < target_num_latent_frames:
        b, c, t, h, w = condition_latent.shape
        padding = condition_latent.new_zeros(b, c, target_num_latent_frames - t, h, w)
        condition_latent = torch.cat([condition_latent, padding], dim=2).contiguous()

    # Number of latent frames actually backed by real input frames.
    num_of_latent_condition = compute_num_latent_frames(model, num_input_frames)

    # Start from pure noise at the SDE's maximum noise level.
    init_noise = misc.arch_invariant_rand(
        (1,) + tuple(state_shape),
        torch.float32,
        model.tensor_kwargs["device"],
        seed,
    )
    x_sigma_max = init_noise * model.sde.sigma_max

    return model.generate_samples_from_batch(
        data_batch,
        guidance=guidance,
        state_shape=state_shape,
        num_steps=num_steps,
        is_negative_prompt=is_negative_prompt,
        seed=seed,
        condition_latent=condition_latent,
        num_condition_t=num_of_latent_condition,
        condition_video_augment_sigma_in_inference=augment_sigma,
        x_sigma_max=x_sigma_max,
    )
510
+
511
+
512
def read_video_or_image_into_frames_BCTHW(
    input_path: str,
    input_path_format: str = "mp4",
    H: int = None,
    W: int = None,
    normalize: bool = True,
    max_frames: int = -1,
    also_return_fps: bool = False,
) -> torch.Tensor:
    """Read video or image file and convert to tensor format.

    Args:
        input_path (str): Path to input video/image file
        input_path_format (str): Format of input file (default: "mp4")
        H (int, optional): Height to resize frames to
        W (int, optional): Width to resize frames to
        normalize (bool): Whether to normalize pixel values to [-1,1] (default: True)
        max_frames (int): Maximum number of frames to read (-1 for all frames)
        also_return_fps (bool): Whether to return fps along with frames

    Returns:
        torch.Tensor | tuple: Video tensor in shape [B,C,T,H,W], optionally with fps if requested

    Note:
        With normalize=True the tensor is also moved to CUDA; with
        normalize=False it stays on CPU in raw [0,255] bfloat16.
    """
    log.debug(f"Reading video from {input_path}")

    loaded_data = load_from_fileobj(input_path, format=input_path_format)
    frames, meta_data = loaded_data
    if input_path.endswith(".png") or input_path.endswith(".jpg") or input_path.endswith(".jpeg"):
        frames = np.array(frames[0])  # HWC, [0,255]
        if frames.shape[-1] > 3:  # RGBA, set the transparent to white
            # Separate the RGB and Alpha channels
            rgb_channels = frames[..., :3]
            alpha_channel = frames[..., 3] / 255.0  # Normalize alpha channel to [0, 1]

            # Create a white background
            white_bg = np.ones_like(rgb_channels) * 255  # White background in RGB

            # Blend the RGB channels with the white background based on the alpha channel
            frames = (rgb_channels * alpha_channel[..., None] + white_bg * (1 - alpha_channel[..., None])).astype(
                np.uint8
            )
        frames = [frames]
        # Still images have no meaningful frame rate.
        fps = 0
    else:
        # NOTE(review): raises TypeError if the container metadata has no
        # "fps" entry — confirm all expected video formats provide it.
        fps = int(meta_data.get("fps"))
    if max_frames != -1:
        frames = frames[:max_frames]
    input_tensor = np.stack(frames, axis=0)
    input_tensor = einops.rearrange(input_tensor, "t h w c -> t c h w")
    if normalize:
        # Maps [0,255] to roughly [-1, 1] (255 -> ~0.992).
        input_tensor = input_tensor / 128.0 - 1.0
        input_tensor = torch.from_numpy(input_tensor).bfloat16()  # TCHW
        log.debug(f"Raw data shape: {input_tensor.shape}")
        if H is not None and W is not None:
            input_tensor = transforms_F.resize(
                input_tensor,
                size=(H, W),  # type: ignore
                interpolation=transforms_F.InterpolationMode.BICUBIC,
                antialias=True,
            )
    input_tensor = einops.rearrange(input_tensor, "(b t) c h w -> b c t h w", b=1)
    if normalize:
        input_tensor = input_tensor.to("cuda")
    log.debug(f"Load shape {input_tensor.shape} value {input_tensor.min()}, {input_tensor.max()}")
    if also_return_fps:
        return input_tensor, fps
    return input_tensor
579
+
580
+
581
def compute_num_latent_frames(model: "DiffusionV2WModel", num_input_frames: int, downsample_factor=8) -> int:
    """This function computes the number of latent frames given the number of input frames.

    Args:
        model (DiffusionV2WModel): video generation model
        num_input_frames (int): number of input frames
        downsample_factor (int): downsample factor for temporal reduce
    Returns:
        int: number of latent frames
    """
    vae = model.tokenizer.video_vae
    # Whole pixel chunks map to whole latent chunks.
    num_latent_frames = (num_input_frames // vae.pixel_chunk_duration) * vae.latent_chunk_duration

    remainder = num_input_frames % vae.latent_chunk_duration
    if remainder == 1:
        num_latent_frames += 1
    elif remainder > 1:
        pixel_remainder = num_input_frames % vae.pixel_chunk_duration
        assert (
            pixel_remainder - 1
        ) % downsample_factor == 0, f"num_input_frames % model.tokenizer.video_vae.pixel_chunk_duration - 1 must be divisible by {downsample_factor}"
        num_latent_frames += 1 + (pixel_remainder - 1) // downsample_factor

    return num_latent_frames
606
+
607
+
608
def create_condition_latent_from_input_frames(
    model: DiffusionV2WModel,
    input_frames: torch.Tensor,
    num_frames_condition: int = 25,
):
    """Build the conditioning latent for video extension from raw input frames.

    The last ``num_frames_condition`` frames of the input are kept, zero-padded
    at the end up to the tokenizer's pixel chunk duration, and encoded.

    Args:
        model (DiffusionV2WModel): Video generation model
        input_frames (torch.Tensor): Input video tensor [B,C,T,H,W], range [-1,1]
        num_frames_condition (int): Number of trailing frames to use for conditioning

    Returns:
        tuple: (condition_latent, encode_input_frames) where:
            - condition_latent (torch.Tensor): Encoded latent condition [B,C,T,H,W]
            - encode_input_frames (torch.Tensor): Padded input frames used for encoding
    """
    batch, channels, _, height, width = input_frames.shape
    # Always encode a full pixel chunk; the condition occupies its first frames.
    num_frames_encode = (
        model.tokenizer.pixel_chunk_duration
    )  # (model.state_shape[1] - 1) / model.vae.pixel_chunk_duration + 1
    log.debug(
        f"num_frames_encode not set, set it based on pixel chunk duration and model state shape: {num_frames_encode}"
    )
    log.debug(
        f"Create condition latent from input frames {input_frames.shape}, value {input_frames.min()}, {input_frames.max()}, dtype {input_frames.dtype}"
    )

    assert (
        input_frames.shape[2] >= num_frames_condition
    ), f"input_frames not enough for condition, require at least {num_frames_condition}, get {input_frames.shape[2]}, {input_frames.shape}"
    assert (
        num_frames_encode >= num_frames_condition
    ), f"num_frames_encode should be larger than num_frames_condition, get {num_frames_encode}, {num_frames_condition}"

    # Conditional frames go at the beginning of the clip; the tail is zero padding.
    condition_frames = input_frames[:, :, -num_frames_condition:]
    pad = condition_frames.new_zeros(batch, channels, num_frames_encode - num_frames_condition, height, width)
    encode_input_frames = torch.cat([condition_frames, pad], dim=2)

    log.debug(
        f"create latent with input shape {encode_input_frames.shape} including padding {num_frames_encode - num_frames_condition} at the end"
    )
    latent = model.encode(encode_input_frames)
    return latent, encode_input_frames
656
+
657
+
658
def get_condition_latent(
    model: DiffusionV2WModel,
    input_image_or_video_path: str,
    num_input_frames: int = 1,
    state_shape: list[int] = None,
):
    """Load conditioning media from disk and encode it into a latent.

    Args:
        model (DiffusionV2WModel): Video generation model
        input_image_or_video_path (str): Path to conditioning image/video
        num_input_frames (int): Number of input frames for video2world prediction
        state_shape (list[int]): Latent state shape; defaults to ``model.state_shape``

    Returns:
        torch.Tensor: Encoded latent condition [B,C,T,H,W], cast to bfloat16.
    """
    assert num_input_frames > 0, "num_input_frames must be greater than 0"
    if state_shape is None:
        state_shape = model.state_shape

    # Pixel-space resolution implied by the latent shape and the tokenizer's
    # spatial compression factor.
    spatial_factor = model.tokenizer.spatial_compression_factor
    target_h = state_shape[-2] * spatial_factor
    target_w = state_shape[-1] * spatial_factor

    extension = input_image_or_video_path.split(".")[-1]
    input_frames = read_video_or_image_into_frames_BCTHW(
        input_image_or_video_path,
        input_path_format=extension,
        H=target_h,
        W=target_w,
    )

    condition_latent, _ = create_condition_latent_from_input_frames(model, input_frames, num_input_frames)
    return condition_latent.to(torch.bfloat16)
697
+
698
+
699
def check_input_frames(input_path: str, required_frames: int) -> bool:
    """Validate that an input image/video can supply enough frames.

    Args:
        input_path: Path to input video or image
        required_frames: Number of required frames

    Returns:
        True when the input provides at least ``required_frames`` frames,
        False otherwise (including unreadable video files).
    """
    if input_path.endswith((".jpg", ".jpeg", ".png")):
        # A still image contributes exactly one frame.
        if required_frames > 1:
            log.error(f"Input ({input_path}) is an image but {required_frames} frames are required")
            return False
        return True  # Let the pipeline handle image loading

    # Video input: probe the container and count its frames.
    try:
        reader = imageio.get_reader(input_path, "ffmpeg")
        frame_count = reader.count_frames()
    except Exception as e:
        log.error(f"Error reading video file {input_path}: {e}")
        return False

    if frame_count < required_frames:
        log.error(f"Input video has {frame_count} frames but {required_frames} frames are required")
        return False
    return True
model_t2w.py ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from typing import Callable, Dict, Optional, Tuple
17
+
18
+ from Cosmos.utils import misc
19
+ import torch
20
+ from torch import Tensor
21
+
22
+ from Cosmos.conditioner import CosmosCondition
23
+ from cosmos1.models.diffusion.diffusion.functional.batch_ops import batch_mul
24
+ from cosmos1.models.diffusion.diffusion.modules.denoiser_scaling import EDMScaling
25
+ from cosmos1.models.diffusion.diffusion.modules.res_sampler import COMMON_SOLVER_OPTIONS, Sampler
26
+ from Cosmos.types import DenoisePrediction
27
+ from Cosmos.module.blocks import FourierFeatures
28
+ from Cosmos.module.pretrained_vae import BaseVAE
29
+ from Cosmos.utils import log
30
+ from Cosmos.lazy_config import instantiate as lazy_instantiate
31
+
32
+
33
+ class EDMSDE:
34
+ def __init__(
35
+ self,
36
+ sigma_max: float,
37
+ sigma_min: float,
38
+ ):
39
+ self.sigma_max = sigma_max
40
+ self.sigma_min = sigma_min
41
+
42
+
43
+ class DiffusionT2WModel(torch.nn.Module):
44
+ """Text-to-world diffusion model that generates video frames from text descriptions.
45
+
46
+ This model implements a diffusion-based approach for generating videos conditioned on text input.
47
+ It handles the full pipeline including encoding/decoding through a VAE, diffusion sampling,
48
+ and classifier-free guidance.
49
+ """
50
+
51
+ def __init__(self, config):
52
+ """Initialize the diffusion model.
53
+
54
+ Args:
55
+ config: Configuration object containing model parameters and architecture settings
56
+ """
57
+ super().__init__()
58
+ # Initialize trained_data_record with defaultdict, key: image, video, iteration
59
+ self.config = config
60
+
61
+ self.precision = {
62
+ "float32": torch.float32,
63
+ "float16": torch.float16,
64
+ "bfloat16": torch.bfloat16,
65
+ }[config.precision]
66
+ self.tensor_kwargs = {"device": "cuda", "dtype": self.precision}
67
+ log.debug(f"DiffusionModel: precision {self.precision}")
68
+ # Timer passed to network to detect slow ranks.
69
+ # 1. set data keys and data information
70
+ self.sigma_data = config.sigma_data
71
+ self.state_shape = list(config.latent_shape)
72
+ self.setup_data_key()
73
+
74
+ # 2. setup up diffusion processing and scaling~(pre-condition), sampler
75
+ self.sde = EDMSDE(sigma_max=80, sigma_min=0.0002)
76
+ self.sampler = Sampler()
77
+ self.scaling = EDMScaling(self.sigma_data)
78
+ self.tokenizer = None
79
+ self.model = None
80
+
81
+ @property
82
+ def net(self):
83
+ return self.model.net
84
+
85
+ @property
86
+ def conditioner(self):
87
+ return self.model.conditioner
88
+
89
+ @property
90
+ def logvar(self):
91
+ return self.model.logvar
92
+
93
+ def set_up_tokenizer(self, tokenizer_dir: str):
94
+ self.tokenizer: BaseVAE = lazy_instantiate(self.config.tokenizer)
95
+ self.tokenizer.load_weights(tokenizer_dir)
96
+ if hasattr(self.tokenizer, "reset_dtype"):
97
+ self.tokenizer.reset_dtype()
98
+
99
+ @misc.timer("DiffusionModel: set_up_model")
100
+ def set_up_model(self, memory_format: torch.memory_format = torch.preserve_format):
101
+ """Initialize the core model components including network, conditioner and logvar."""
102
+ self.model = self.build_model()
103
+ self.model = self.model.to(memory_format=memory_format, **self.tensor_kwargs)
104
+
105
+ def build_model(self) -> torch.nn.ModuleDict:
106
+ """Construct the model's neural network components.
107
+
108
+ Returns:
109
+ ModuleDict containing the network, conditioner and logvar components
110
+ """
111
+ config = self.config
112
+ net = lazy_instantiate(config.net)
113
+ conditioner = lazy_instantiate(config.conditioner)
114
+ logvar = torch.nn.Sequential(
115
+ FourierFeatures(num_channels=128, normalize=True), torch.nn.Linear(128, 1, bias=False)
116
+ )
117
+
118
+ return torch.nn.ModuleDict(
119
+ {
120
+ "net": net,
121
+ "conditioner": conditioner,
122
+ "logvar": logvar,
123
+ }
124
+ )
125
+
126
+ @torch.no_grad()
127
+ def encode(self, state: torch.Tensor) -> torch.Tensor:
128
+ """Encode input state into latent representation using VAE.
129
+
130
+ Args:
131
+ state: Input tensor to encode
132
+
133
+ Returns:
134
+ Encoded latent representation scaled by sigma_data
135
+ """
136
+ return self.tokenizer.encode(state) * self.sigma_data
137
+
138
+ @torch.no_grad()
139
+ def decode(self, latent: torch.Tensor) -> torch.Tensor:
140
+ """Decode latent representation back to pixel space using VAE.
141
+
142
+ Args:
143
+ latent: Latent tensor to decode
144
+
145
+ Returns:
146
+ Decoded tensor in pixel space
147
+ """
148
+ return self.tokenizer.decode(latent / self.sigma_data)
149
+
150
+ def setup_data_key(self) -> None:
151
+ """Configure input data keys for video and image data."""
152
+ self.input_data_key = self.config.input_data_key # by default it is video key for Video diffusion model
153
+
154
+ def get_x0_fn_from_batch(
155
+ self,
156
+ data_batch: Dict,
157
+ guidance: float = 1.5,
158
+ is_negative_prompt: bool = False,
159
+ ) -> Callable:
160
+ """
161
+ Generates a callable function `x0_fn` based on the provided data batch and guidance factor.
162
+
163
+ This function processes the input data batch through a conditioning workflow to obtain
164
+ conditioned and unconditioned states. It then defines a nested function `x0_fn` which
165
+ applies denoising on an input `noise_x` at a given noise level `sigma`.
166
+
167
+ Args:
168
+ data_batch: A batch of data used for conditioning. Format should align with conditioner
169
+ guidance: Scalar value that modulates influence of conditioned vs unconditioned state
170
+ is_negative_prompt: Use negative prompt t5 in uncondition if true
171
+
172
+ Returns:
173
+ A function `x0_fn(noise_x, sigma)` that takes noise_x and sigma, returns x0 prediction
174
+ """
175
+ if is_negative_prompt:
176
+ condition, uncondition = self.conditioner.get_condition_with_negative_prompt(data_batch)
177
+ else:
178
+ condition, uncondition = self.conditioner.get_condition_uncondition(data_batch)
179
+
180
+ def x0_fn(noise_x: torch.Tensor, sigma: torch.Tensor) -> torch.Tensor:
181
+ cond_x0 = self.denoise(noise_x, sigma, condition).x0
182
+ uncond_x0 = self.denoise(noise_x, sigma, uncondition).x0
183
+ raw_x0 = cond_x0 + guidance * (cond_x0 - uncond_x0)
184
+ if "guided_image" in data_batch:
185
+ # replacement trick that enables inpainting with base model
186
+ assert "guided_mask" in data_batch, "guided_mask should be in data_batch if guided_image is present"
187
+ guide_image = data_batch["guided_image"]
188
+ guide_mask = data_batch["guided_mask"]
189
+ raw_x0 = guide_mask * guide_image + (1 - guide_mask) * raw_x0
190
+
191
+ return raw_x0
192
+
193
+ return x0_fn
194
+
195
+ def denoise(self, xt: torch.Tensor, sigma: torch.Tensor, condition: CosmosCondition) -> DenoisePrediction:
196
+ """
197
+ Performs denoising on the input noise data, noise level, and condition
198
+
199
+ Args:
200
+ xt (torch.Tensor): The input noise data.
201
+ sigma (torch.Tensor): The noise level.
202
+ condition (CosmosCondition): conditional information, generated from self.conditioner
203
+
204
+ Returns:
205
+ DenoisePrediction: The denoised prediction, it includes clean data predicton (x0), \
206
+ noise prediction (eps_pred) and optional confidence (logvar).
207
+ """
208
+
209
+ xt = xt.to(**self.tensor_kwargs)
210
+ sigma = sigma.to(**self.tensor_kwargs)
211
+ # get precondition for the network
212
+ c_skip, c_out, c_in, c_noise = self.scaling(sigma=sigma)
213
+
214
+ # forward pass through the network
215
+ net_output = self.net(
216
+ x=batch_mul(c_in, xt), # Eq. 7 of https://arxiv.org/pdf/2206.00364.pdf
217
+ timesteps=c_noise, # Eq. 7 of https://arxiv.org/pdf/2206.00364.pdf
218
+ **condition.to_dict(),
219
+ )
220
+
221
+ logvar = self.model.logvar(c_noise)
222
+ x0_pred = batch_mul(c_skip, xt) + batch_mul(c_out, net_output)
223
+
224
+ # get noise prediction based on sde
225
+ eps_pred = batch_mul(xt - x0_pred, 1.0 / sigma)
226
+
227
+ return DenoisePrediction(x0_pred, eps_pred, logvar)
228
+
229
+ def generate_samples_from_batch(
230
+ self,
231
+ data_batch: Dict,
232
+ guidance: float = 1.5,
233
+ seed: int = 1,
234
+ state_shape: Tuple | None = None,
235
+ n_sample: int | None = None,
236
+ is_negative_prompt: bool = False,
237
+ num_steps: int = 35,
238
+ solver_option: COMMON_SOLVER_OPTIONS = "2ab",
239
+ x_sigma_max: Optional[torch.Tensor] = None,
240
+ sigma_max: float | None = None,
241
+ ) -> Tensor:
242
+ """Generate samples from a data batch using diffusion sampling.
243
+
244
+ This function generates samples from either image or video data batches using diffusion sampling.
245
+ It handles both conditional and unconditional generation with classifier-free guidance.
246
+
247
+ Args:
248
+ data_batch (Dict): Raw data batch from the training data loader
249
+ guidance (float, optional): Classifier-free guidance weight. Defaults to 1.5.
250
+ seed (int, optional): Random seed for reproducibility. Defaults to 1.
251
+ state_shape (Tuple | None, optional): Shape of the state tensor. Uses self.state_shape if None. Defaults to None.
252
+ n_sample (int | None, optional): Number of samples to generate. Defaults to None.
253
+ is_negative_prompt (bool, optional): Whether to use negative prompt for unconditional generation. Defaults to False.
254
+ num_steps (int, optional): Number of diffusion sampling steps. Defaults to 35.
255
+ solver_option (COMMON_SOLVER_OPTIONS, optional): Differential equation solver option. Defaults to "2ab" (multistep solver).
256
+ x_sigma_max (Optional[torch.Tensor], optional): Initial noisy tensor. If None, randomly initialized. Defaults to None.
257
+ sigma_max (float | None, optional): Maximum noise level. Uses self.sde.sigma_max if None. Defaults to None.
258
+
259
+ Returns:
260
+ Tensor: Generated samples after diffusion sampling
261
+ """
262
+ x0_fn = self.get_x0_fn_from_batch(data_batch, guidance, is_negative_prompt=is_negative_prompt)
263
+ if sigma_max is None:
264
+ sigma_max = self.sde.sigma_max
265
+ else:
266
+ log.info("Using provided sigma_max for diffusion sampling.")
267
+ if x_sigma_max is None:
268
+ x_sigma_max = (
269
+ misc.arch_invariant_rand(
270
+ (n_sample,) + tuple(state_shape),
271
+ torch.float32,
272
+ self.tensor_kwargs["device"],
273
+ seed,
274
+ )
275
+ * sigma_max
276
+ )
277
+
278
+ samples = self.sampler(
279
+ x0_fn, x_sigma_max, num_steps=num_steps, sigma_max=sigma_max, solver_option=solver_option
280
+ )
281
+
282
+ return samples
model_v2w.py ADDED
@@ -0,0 +1,341 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from dataclasses import dataclass
17
+ from typing import Callable, Dict, Optional, Tuple, Union
18
+
19
+ from Cosmos.utils import misc
20
+ import torch
21
+ from torch import Tensor
22
+
23
+ from Cosmos.conditioner import VideoExtendCondition
24
+ from cosmos1.models.diffusion.config.base.conditioner import VideoCondBoolConfig
25
+ from cosmos1.models.diffusion.diffusion.functional.batch_ops import batch_mul
26
+ from Cosmos.model_t2w import DiffusionT2WModel
27
+ from Cosmos.utils import log
28
+
29
+
30
@dataclass
class VideoDenoisePrediction:
    """Denoiser outputs for video-extension (video2world) sampling."""

    x0: torch.Tensor  # clean data prediction
    eps: Optional[torch.Tensor] = None  # noise prediction
    logvar: Optional[torch.Tensor] = None  # log variance of noise prediction, can be used as a confidence / uncertainty
    xt: Optional[torch.Tensor] = None  # input to the network, before multiplying with c_in
    x0_pred_replaced: Optional[torch.Tensor] = None  # x0 prediction with condition region replaced by gt_latent
37
+
38
+
39
class DiffusionV2WModel(DiffusionT2WModel):
    """Video-to-world extension of the text-to-world diffusion model.

    In addition to text conditioning, generation is conditioned on ground-truth
    latent frames (e.g. encoded from an input image or video clip).
    """

    def __init__(self, config):
        """Initialize with the same configuration object as DiffusionT2WModel."""
        super().__init__(config)

    def augment_conditional_latent_frames(
        self,
        condition: VideoExtendCondition,
        cfg_video_cond_bool: VideoCondBoolConfig,
        gt_latent: Tensor,
        condition_video_augment_sigma_in_inference: float = 0.001,
        sigma: Tensor = None,
        seed: int = 1,
    ) -> Union[VideoExtendCondition, Tensor]:
        """Augments the conditional frames with noise during inference.

        Args:
            condition (VideoExtendCondition): condition object
                condition_video_indicator: binary tensor indicating the region is condition(value=1) or generation(value=0). Bx1xTx1x1 tensor.
                condition_video_input_mask: input mask for the network input, indicating the condition region. B,1,T,H,W tensor. will be concat with the input for the network.
            cfg_video_cond_bool (VideoCondBoolConfig): video condition bool config
            gt_latent (Tensor): ground truth latent tensor in shape B,C,T,H,W
            condition_video_augment_sigma_in_inference (float): sigma for condition video augmentation in inference
            sigma (Tensor): noise level for the generation region
            seed (int): random seed for reproducibility
        Returns:
            VideoExtendCondition: updated condition object
                condition_video_augment_sigma: sigma for the condition region, feed to the network
            augment_latent (Tensor): augmented latent tensor in shape B,C,T,H,W

        """

        # Inference only, use fixed sigma for the condition region
        assert (
            condition_video_augment_sigma_in_inference is not None
        ), "condition_video_augment_sigma_in_inference should be provided"
        augment_sigma = condition_video_augment_sigma_in_inference

        if augment_sigma >= sigma.flatten()[0]:
            # This is a inference trick! If the sampling sigma is smaller than the augment sigma, we will start denoising the condition region together.
            # This is achieved by setting all region as `generation`, i.e. value=0
            log.debug("augment_sigma larger than sigma or other frame, remove condition")
            condition.condition_video_indicator = condition.condition_video_indicator * 0

        augment_sigma = torch.tensor([augment_sigma], **self.tensor_kwargs)

        # Now apply the augment_sigma to the gt_latent

        # Seeded, architecture-invariant Gaussian noise for the condition region.
        noise = misc.arch_invariant_rand(
            gt_latent.shape,
            torch.float32,
            self.tensor_kwargs["device"],
            seed,
        )

        augment_latent = gt_latent + noise * augment_sigma[:, None, None, None, None]

        _, _, c_in_augment, _ = self.scaling(sigma=augment_sigma)

        # Multiply the whole latent with c_in_augment
        augment_latent_cin = batch_mul(augment_latent, c_in_augment)

        # Since the whole latent will multiply with c_in later, we divide the value to cancel the effect
        _, _, c_in, _ = self.scaling(sigma=sigma)
        augment_latent_cin = batch_mul(augment_latent_cin, 1 / c_in)

        return condition, augment_latent_cin

    def denoise(
        self,
        noise_x: Tensor,
        sigma: Tensor,
        condition: VideoExtendCondition,
        condition_video_augment_sigma_in_inference: float = 0.001,
        seed: int = 1,
    ) -> VideoDenoisePrediction:
        """Denoises input tensor using conditional video generation.

        Args:
            noise_x (Tensor): Noisy input tensor.
            sigma (Tensor): Noise level.
            condition (VideoExtendCondition): Condition for denoising.
            condition_video_augment_sigma_in_inference (float): sigma for condition video augmentation in inference
            seed (int): Random seed for reproducibility
        Returns:
            VideoDenoisePrediction containing:
            - x0: Denoised prediction
            - eps: Noise prediction
            - logvar: Log variance of noise prediction
            - xt: Input before c_in multiplication
            - x0_pred_replaced: x0 prediction with condition regions replaced by ground truth
        """

        assert (
            condition.gt_latent is not None
        ), f"find None gt_latent in condition, likely didn't call self.add_condition_video_indicator_and_video_input_mask when preparing the condition or this is a image batch but condition.data_type is wrong, get {noise_x.shape}"
        gt_latent = condition.gt_latent
        cfg_video_cond_bool: VideoCondBoolConfig = self.config.conditioner.video_cond_bool

        condition_latent = gt_latent

        # Augment the latent with different sigma value, and add the augment_sigma to the condition object if needed
        condition, augment_latent = self.augment_conditional_latent_frames(
            condition, cfg_video_cond_bool, condition_latent, condition_video_augment_sigma_in_inference, sigma, seed
        )
        condition_video_indicator = condition.condition_video_indicator  # [B, 1, T, 1, 1]

        # Compose the model input with condition region (augment_latent) and generation region (noise_x)
        new_noise_xt = condition_video_indicator * augment_latent + (1 - condition_video_indicator) * noise_x
        # Call the base model
        denoise_pred = super().denoise(new_noise_xt, sigma, condition)

        # Replace the condition region of the prediction with the ground-truth latent.
        x0_pred_replaced = condition_video_indicator * gt_latent + (1 - condition_video_indicator) * denoise_pred.x0

        x0_pred = x0_pred_replaced

        return VideoDenoisePrediction(
            x0=x0_pred,
            eps=batch_mul(noise_x - x0_pred, 1.0 / sigma),
            logvar=denoise_pred.logvar,
            xt=new_noise_xt,
            x0_pred_replaced=x0_pred_replaced,
        )

    def generate_samples_from_batch(
        self,
        data_batch: Dict,
        guidance: float = 1.5,
        seed: int = 1,
        state_shape: Tuple | None = None,
        n_sample: int | None = None,
        is_negative_prompt: bool = False,
        num_steps: int = 35,
        condition_latent: Union[torch.Tensor, None] = None,
        num_condition_t: Union[int, None] = None,
        condition_video_augment_sigma_in_inference: float = None,
        add_input_frames_guidance: bool = False,
        x_sigma_max: Optional[torch.Tensor] = None,
    ) -> Tensor:
        """Generates video samples conditioned on input frames.

        Args:
            data_batch: Input data dictionary
            guidance: Classifier-free guidance scale
            seed: Random seed for reproducibility
            state_shape: Shape of output tensor (defaults to model's state shape)
            n_sample: Number of samples to generate (defaults to batch size)
            is_negative_prompt: Whether to use negative prompting
            num_steps: Number of denoising steps
            condition_latent: Conditioning frames tensor (B,C,T,H,W); required
            num_condition_t: Number of frames to condition on
            condition_video_augment_sigma_in_inference: Noise level for condition augmentation
            add_input_frames_guidance: Whether to apply guidance to input frames
            x_sigma_max: Maximum noise level tensor

        Returns:
            Generated video samples tensor
        """

        if n_sample is None:
            # Default the sample count to the batch size of the input data.
            input_key = self.input_data_key
            n_sample = data_batch[input_key].shape[0]
        if state_shape is None:
            log.debug(f"Default Video state shape is used. {self.state_shape}")
            state_shape = self.state_shape

        assert condition_latent is not None, "condition_latent should be provided"

        x0_fn = self.get_x0_fn_from_batch_with_condition_latent(
            data_batch,
            guidance,
            is_negative_prompt=is_negative_prompt,
            condition_latent=condition_latent,
            num_condition_t=num_condition_t,
            condition_video_augment_sigma_in_inference=condition_video_augment_sigma_in_inference,
            add_input_frames_guidance=add_input_frames_guidance,
            seed=seed,
        )
        if x_sigma_max is None:
            # Seeded, architecture-invariant Gaussian init scaled to the starting noise level.
            x_sigma_max = (
                misc.arch_invariant_rand(
                    (n_sample,) + tuple(state_shape),
                    torch.float32,
                    self.tensor_kwargs["device"],
                    seed,
                )
                * self.sde.sigma_max
            )

        samples = self.sampler(x0_fn, x_sigma_max, num_steps=num_steps, sigma_max=self.sde.sigma_max)
        return samples

    def get_x0_fn_from_batch_with_condition_latent(
        self,
        data_batch: Dict,
        guidance: float = 1.5,
        is_negative_prompt: bool = False,
        condition_latent: torch.Tensor = None,
        num_condition_t: Union[int, None] = None,
        condition_video_augment_sigma_in_inference: float = None,
        add_input_frames_guidance: bool = False,
        seed: int = 1,
    ) -> Callable:
        """Creates denoising function for conditional video generation.

        Args:
            data_batch: Input data dictionary
            guidance: Classifier-free guidance scale
            is_negative_prompt: Whether to use negative prompting
            condition_latent: Conditioning frames tensor (B,C,T,H,W)
            num_condition_t: Number of frames to condition on
            condition_video_augment_sigma_in_inference: Noise level for condition augmentation
            add_input_frames_guidance: Whether to apply guidance to input frames
            seed: Random seed for reproducibility

        Returns:
            Function that takes noisy input and noise level and returns denoised prediction
        """
        if is_negative_prompt:
            condition, uncondition = self.conditioner.get_condition_with_negative_prompt(data_batch)
        else:
            condition, uncondition = self.conditioner.get_condition_uncondition(data_batch)

        condition.video_cond_bool = True
        condition = self.add_condition_video_indicator_and_video_input_mask(
            condition_latent, condition, num_condition_t
        )

        # When add_input_frames_guidance is set, the unconditional branch drops the
        # video condition too, so guidance also applies to the input frames.
        uncondition.video_cond_bool = False if add_input_frames_guidance else True
        uncondition = self.add_condition_video_indicator_and_video_input_mask(
            condition_latent, uncondition, num_condition_t
        )

        def x0_fn(noise_x: torch.Tensor, sigma: torch.Tensor) -> torch.Tensor:
            cond_x0 = self.denoise(
                noise_x,
                sigma,
                condition,
                condition_video_augment_sigma_in_inference=condition_video_augment_sigma_in_inference,
                seed=seed,
            ).x0_pred_replaced
            uncond_x0 = self.denoise(
                noise_x,
                sigma,
                uncondition,
                condition_video_augment_sigma_in_inference=condition_video_augment_sigma_in_inference,
                seed=seed,
            ).x0_pred_replaced

            # Classifier-free guidance on the replaced predictions.
            return cond_x0 + guidance * (cond_x0 - uncond_x0)

        return x0_fn

    def add_condition_video_indicator_and_video_input_mask(
        self, latent_state: torch.Tensor, condition: VideoExtendCondition, num_condition_t: Union[int, None] = None
    ) -> VideoExtendCondition:
        """Adds conditioning masks to VideoExtendCondition object.

        Creates binary indicators and input masks for conditional video generation.

        Args:
            latent_state: Input latent tensor (B,C,T,H,W)
            condition: VideoExtendCondition object to update
            num_condition_t: Number of frames to condition on

        Returns:
            Updated VideoExtendCondition with added masks:
            - condition_video_indicator: Binary tensor marking condition regions
            - condition_video_input_mask: Input mask for network
            - gt_latent: Ground truth latent tensor
        """
        T = latent_state.shape[2]
        latent_dtype = latent_state.dtype
        # Batch dim is 1 and broadcasts against the B-sized latent in later multiplications.
        condition_video_indicator = torch.zeros(1, 1, T, 1, 1, device=latent_state.device).type(
            latent_dtype
        )  # 1 for condition region

        # Only in inference to decide the condition region
        assert num_condition_t is not None, "num_condition_t should be provided"
        assert num_condition_t <= T, f"num_condition_t should be less than T, get {num_condition_t}, {T}"
        log.debug(
            f"condition_location first_n, num_condition_t {num_condition_t}, condition.video_cond_bool {condition.video_cond_bool}"
        )
        # Mark the first num_condition_t frames as the condition region.
        condition_video_indicator[:, :, :num_condition_t] += 1.0

        condition.gt_latent = latent_state
        condition.condition_video_indicator = condition_video_indicator

        B, C, T, H, W = latent_state.shape
        # Create additional input_mask channel, this will be concatenated to the input of the network
        # See design doc section (Implementation detail A.1 and A.2) for visualization
        ones_padding = torch.ones((B, 1, T, H, W), dtype=latent_state.dtype, device=latent_state.device)
        zeros_padding = torch.zeros((B, 1, T, H, W), dtype=latent_state.dtype, device=latent_state.device)
        assert condition.video_cond_bool is not None, "video_cond_bool should be set"

        # The input mask indicate whether the input is conditional region or not
        if condition.video_cond_bool:  # Condition one given video frames
            condition.condition_video_input_mask = (
                condition_video_indicator * ones_padding + (1 - condition_video_indicator) * zeros_padding
            )
        else:  # Unconditional case, use for cfg
            condition.condition_video_input_mask = zeros_padding

        return condition
t5_text_encoder.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from typing import List, Tuple, Union
17
+
18
+ import torch
19
+ import transformers
20
+ from transformers import T5EncoderModel, T5TokenizerFast
21
+
22
+ from Cosmos.utils import log
23
+
24
+ transformers.logging.set_verbosity_error()
25
+
26
+
27
class CosmosT5TextEncoder(torch.nn.Module):
    """Wraps a T5 tokenizer and encoder to turn text prompts into embeddings."""

    def __init__(self, model_name: str = "google-t5/t5-11b", device: str = "cuda", cache_dir: str = "~/.cache"):
        """Initializes the T5 tokenizer and encoder.

        Args:
            model_name: The name of the T5 model to use.
            device: The device to use for computations.
            cache_dir: Directory to look up / store downloaded weights. If loading
                with this cache_dir fails, we retry with the transformers default
                cache location.
        """
        super().__init__()
        try:
            self.tokenizer = T5TokenizerFast.from_pretrained(model_name, cache_dir=cache_dir)
            self.text_encoder = T5EncoderModel.from_pretrained(model_name, cache_dir=cache_dir).to(device)
        except Exception as e:
            log.warning(f"Failed to load T5 model using cache_dir '{cache_dir}', falling back to default location: {e}")
            self.tokenizer = T5TokenizerFast.from_pretrained(model_name)
            self.text_encoder = T5EncoderModel.from_pretrained(model_name).to(device)
        self.text_encoder.eval()  # inference only; disable dropout etc.
        self.device = device

    @torch.inference_mode()
    def encode_prompts(
        self, prompts: Union[str, List[str]], max_length: int = 512
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Encodes text prompts into hidden state representations using a T5 encoder.

        This function tokenizes the input prompts, processes them through a T5 text
        encoder, and returns the last hidden states. The encoded outputs beyond the
        actual sequence length are zero-padded. All prompts in a batch are padded to
        max_length.

        Args:
            prompts: Input text to encode. Can be a single string or a list of strings.
            max_length: Maximum sequence length for tokenization and padding. Longer
                sequences will be truncated. Defaults to 512.

        Returns:
            tuple[torch.Tensor, torch.Tensor]: A tuple containing:
                - Encoded text embeddings of shape (batch_size, max_length, hidden_size)
                  with padding positions zeroed
                - Attention mask of shape (batch_size, max_length)

        Raises:
            ValueError: If the input prompts list is empty.

        Example:
            >>> encoder = CosmosT5TextEncoder()
            >>> embeddings, mask = encoder.encode_prompts(["Hello world", "Another example"], max_length=128)
        """
        if isinstance(prompts, str):
            prompts = [prompts]

        if not prompts:
            raise ValueError("The input prompt list is empty.")

        batch_encoding = self.tokenizer.batch_encode_plus(
            prompts,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=max_length,
            return_length=True,
            return_offsets_mapping=False,
        )

        input_ids = batch_encoding.input_ids.to(self.device)
        attn_mask = batch_encoding.attention_mask.to(self.device)

        outputs = self.text_encoder(input_ids=input_ids, attention_mask=attn_mask)

        # Zero the embeddings at padded positions in one vectorized op instead of
        # a per-sample Python loop. With right-padding (the fast T5 tokenizer's
        # behavior here), mask == 0 exactly at positions >= the sequence length,
        # matching the original `encoded_text[b][length:] = 0` semantics.
        encoded_text = outputs.last_hidden_state
        encoded_text = encoded_text.masked_fill(attn_mask.unsqueeze(-1) == 0, 0.0)

        return encoded_text, attn_mask
text2world.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import argparse
17
+ import os
18
+
19
+ from Cosmos.utils import misc
20
+ import torch
21
+
22
+ from Cosmos.inference_utils import add_common_arguments, validate_args
23
+ from Cosmos.world_generation_pipeline import DiffusionText2WorldGenerationPipeline
24
+ from Cosmos.utils import log
25
+ from Cosmos.utils.io import read_prompts_from_file, save_video
26
+
27
+ torch.enable_grad(False)
28
+
29
+
30
def parse_arguments() -> argparse.Namespace:
    """Build the CLI parser for the text2world demo and parse sys.argv."""
    parser = argparse.ArgumentParser(description="Text to world generation demo script")
    # Options shared by every inference demo script.
    add_common_arguments(parser)

    # Options specific to text2world generation.
    parser.add_argument(
        "--diffusion_transformer_dir",
        type=str,
        default="Cosmos-1.0-Diffusion-7B-Text2World",
        choices=[
            "Cosmos-1.0-Diffusion-7B-Text2World",
            "Cosmos-1.0-Diffusion-14B-Text2World",
        ],
        help="DiT model weights directory name relative to checkpoint_dir",
    )
    parser.add_argument(
        "--prompt_upsampler_dir",
        type=str,
        default="Cosmos-1.0-Prompt-Upsampler-12B-Text2World",
        help="Prompt upsampler weights directory relative to checkpoint_dir",
    )
    parser.add_argument(
        "--word_limit_to_skip_upsampler",
        type=int,
        default=250,
        help="Skip prompt upsampler for better robustness if the number of words in the prompt is greater than this value",
    )

    return parser.parse_args()
61
+
62
+
63
def demo(cfg):
    """Run text-to-world generation demo.

    This function handles the main text-to-world generation pipeline, including:
    - Setting up the random seed for reproducibility
    - Initializing the generation pipeline with the provided configuration
    - Processing single or multiple prompts from input
    - Generating videos from text prompts
    - Saving the generated videos and corresponding prompts to disk

    Args:
        cfg (argparse.Namespace): Configuration namespace containing:
            - Model configuration (checkpoint paths, model settings)
            - Generation parameters (guidance, steps, dimensions)
            - Input/output settings (prompts, save paths)
            - Performance options (model offloading settings)

    The function will save:
        - Generated MP4 video files
        - Text files containing the processed prompts

    If guardrails block the generation, a critical log message is displayed
    and the function continues to the next prompt if available.
    """
    misc.set_random_seed(cfg.seed)
    inference_type = "text2world"
    validate_args(cfg, inference_type)

    # Initialize text2world generation model pipeline
    pipeline = DiffusionText2WorldGenerationPipeline(
        inference_type=inference_type,
        checkpoint_dir=cfg.checkpoint_dir,
        checkpoint_name=cfg.diffusion_transformer_dir,
        prompt_upsampler_dir=cfg.prompt_upsampler_dir,
        enable_prompt_upsampler=not cfg.disable_prompt_upsampler,
        offload_network=cfg.offload_diffusion_transformer,
        offload_tokenizer=cfg.offload_tokenizer,
        offload_text_encoder_model=cfg.offload_text_encoder_model,
        offload_prompt_upsampler=cfg.offload_prompt_upsampler,
        offload_guardrail_models=cfg.offload_guardrail_models,
        guidance=cfg.guidance,
        num_steps=cfg.num_steps,
        height=cfg.height,
        width=cfg.width,
        fps=cfg.fps,
        num_video_frames=cfg.num_video_frames,
        seed=cfg.seed,
    )

    # Handle multiple prompts if prompt file is provided
    if cfg.batch_input_path:
        # Bug fix: was `args.batch_input_path`, which is undefined when demo() is
        # imported and called directly (it relied on a module-level global).
        log.info(f"Reading batch inputs from path: {cfg.batch_input_path}")
        prompts = read_prompts_from_file(cfg.batch_input_path)
    else:
        # Single prompt case
        prompts = [{"prompt": cfg.prompt}]

    os.makedirs(cfg.video_save_folder, exist_ok=True)
    for i, input_dict in enumerate(prompts):
        current_prompt = input_dict.get("prompt", None)
        if current_prompt is None:
            log.critical("Prompt is missing, skipping world generation.")
            continue

        # Generate video
        generated_output = pipeline.generate(current_prompt, cfg.negative_prompt, cfg.word_limit_to_skip_upsampler)
        if generated_output is None:
            log.critical("Guardrail blocked text2world generation.")
            continue
        video, prompt = generated_output

        # Batch runs are saved by index; single runs by the configured name.
        if cfg.batch_input_path:
            video_save_path = os.path.join(cfg.video_save_folder, f"{i}.mp4")
            prompt_save_path = os.path.join(cfg.video_save_folder, f"{i}.txt")
        else:
            video_save_path = os.path.join(cfg.video_save_folder, f"{cfg.video_save_name}.mp4")
            prompt_save_path = os.path.join(cfg.video_save_folder, f"{cfg.video_save_name}.txt")

        # Save video
        save_video(
            video=video,
            fps=cfg.fps,
            H=cfg.height,
            W=cfg.width,
            video_save_quality=5,
            video_save_path=video_save_path,
        )

        # Save prompt to text file alongside video (binary mode + explicit
        # encode keeps the bytes identical across platforms).
        with open(prompt_save_path, "wb") as f:
            f.write(prompt.encode("utf-8"))

        log.info(f"Saved video to {video_save_path}")
        log.info(f"Saved prompt to {prompt_save_path}")
157
+
158
+
159
# Script entry point: parse the CLI arguments once and run the demo.
if __name__ == "__main__":
    args = parse_arguments()
    demo(args)
text2world_prompt_upsampler_inference.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """
17
+ This demo script is used to run inference for Cosmos-1.0-Prompt-Upsampler-12B-Text2World.
18
+ Command:
19
+ PYTHONPATH=$(pwd) python cosmos1/models/diffusion/prompt_upsampler/text2world_prompt_upsampler_inference.py
20
+
21
+ """
22
+ import argparse
23
+ import os
24
+ import re
25
+
26
+ from cosmos1.models.autoregressive.configs.base.model_config import create_text_model_config
27
+ from cosmos1.models.autoregressive.model import AutoRegressiveModel
28
+ from cosmos1.models.diffusion.prompt_upsampler.inference import chat_completion
29
+ from Cosmos import guardrail_presets as guardrail_presets
30
+ from Cosmos.utils import log
31
+
32
+
33
def create_prompt_upsampler(checkpoint_dir: str) -> AutoRegressiveModel:
    """Instantiate the text2world prompt-upsampler LLM from `checkpoint_dir`.

    The directory is expected to contain `model.pt` and the tokenizer files.
    """
    ckpt_path = os.path.join(checkpoint_dir, "model.pt")
    model_config, tokenizer_config = create_text_model_config(
        model_ckpt_path=ckpt_path,
        tokenizer_path=checkpoint_dir,
        model_family="mistral",
        model_size="12b",
        is_instruct_model=True,
        max_batch_size=1,
        rope_dim="1D",
        add_special_tokens=True,
        max_seq_len=1024,
        pytorch_rope_version="v1",
    )
    log.debug(f"Text prompt upsampler model config: {model_config}")

    # Build the LLM instance and move it onto the GPU.
    llm = AutoRegressiveModel.build(
        model_config=model_config,
        tokenizer_config=tokenizer_config,
    )
    return llm.to("cuda")
53
+
54
+
55
def run_chat_completion(model: AutoRegressiveModel, input: str, temperature: float = 0.01):
    """Upsample a short caption with the chat-finetuned prompt-upsampler model.

    The text2world prompt upsampler is finetuned for chat. During training its
    context window was 512 tokens; for inference max_seq_len is 1024 to
    accommodate longer inputs. Setting `max_gen_len` is optional as the
    finetuned model naturally determines when to stop generating.
    """
    user_message = {"role": "user", "content": f"Upsample the short caption to a long caption: {str(input)}"}

    results = chat_completion(
        model,
        [[user_message]],
        max_gen_len=512,
        temperature=temperature,
        top_p=None,
        top_k=None,
        logprobs=False,
    )
    # Strip formatting artifacts from the generated long caption.
    return str(clean_text(results[0]["generation"]["content"]))
75
+
76
+
77
def clean_text(text: str) -> str:
    """Clean the text by removing prefixes, suffixes, formatting markers, and normalizing whitespace."""
    # Collapse both newline variants into spaces.
    text = text.replace("\n", " ").replace("\r", " ")

    # Handle sections of the form '- **...**': drop short fragments, keep long ones.
    bold_section = r"(- \*\*)(.*?)(\*\*)"

    def _keep_or_drop(match: re.Match[str]) -> str:
        # Fewer than 10 words inside the markers -> remove the whole section;
        # otherwise keep it verbatim.
        inner_words = re.findall(r"\w+", match.group(2))
        return match.group(0) if len(inner_words) >= 10 else ""

    text = re.sub(bold_section, _keep_or_drop, text)

    # Strip one leading prefix, if present (checked in this fixed order).
    for prefix in ("Caption:", "#####", "####", "- ", "* ", ","):
        if text.startswith(prefix):
            text = text[len(prefix):].lstrip()

    # Normalize internal whitespace, then trim leftover punctuation and quotes.
    text = " ".join(text.split())
    return text.strip(' -,*:"\'"“”')
112
+
113
+
114
def parse_args():
    """Parse command-line options for the prompt-upsampler inference demo."""
    parser = argparse.ArgumentParser(description="Run prompt upsampler inference")
    parser.add_argument("--input", type=str, default="A dog is playing with a ball.")
    parser.add_argument("--temperature", type=float, default=0.01, help="Inference temperature")
    parser.add_argument("--checkpoint_dir", type=str, default="checkpoints",
                        help="Base directory containing model checkpoints")
    parser.add_argument("--prompt_upsampler_dir", type=str,
                        default="Cosmos-1.0-Prompt-Upsampler-12B-Text2World",
                        help="Prompt upsampler weights directory relative to checkpoint_dir")
    parser.add_argument("--guardrail_dir", type=str, default="Cosmos-1.0-Guardrail",
                        help="Guardrail weights directory relative to checkpoint_dir")
    return parser.parse_args()
134
+
135
+
136
def main(args):
    """Guardrail-check the input, upsample it, guardrail-check the result, and log it."""
    guardrail_runner = guardrail_presets.create_text_guardrail_runner(
        os.path.join(args.checkpoint_dir, args.guardrail_dir)
    )

    # Refuse unsafe input before spending compute on the upsampler.
    if not guardrail_presets.run_text_guardrail(args.input, guardrail_runner):
        log.critical("Input text prompt is not safe.")
        return

    prompt_upsampler = create_prompt_upsampler(os.path.join(args.checkpoint_dir, args.prompt_upsampler_dir))
    upsampled_prompt = run_chat_completion(prompt_upsampler, args.input, temperature=args.temperature)

    # The generated prompt must pass the same guardrail before being reported.
    if not guardrail_presets.run_text_guardrail(upsampled_prompt, guardrail_runner):
        log.critical("Upsampled text prompt is not safe.")
        return

    log.info(f"Upsampled prompt: {upsampled_prompt}")
153
+
154
+
155
# Script entry point.
if __name__ == "__main__":
    main(parse_args())
types.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from __future__ import annotations
17
+
18
+ from dataclasses import dataclass
19
+ from typing import Optional
20
+
21
+ import torch
22
+
23
+
24
@dataclass
class DenoisePrediction:
    """Container for the outputs of a single diffusion denoising call."""

    x0: torch.Tensor  # clean data prediction
    eps: Optional[torch.Tensor] = None  # noise prediction
    logvar: Optional[torch.Tensor] = None  # log variance of noise prediction, can be used as a confidence / uncertainty
video2world.py ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import argparse
17
+ import os
18
+
19
+ from Cosmos.utils import misc
20
+ import torch
21
+
22
+ from Cosmos.inference_utils import add_common_arguments, check_input_frames, validate_args
23
+ from Cosmos.world_generation_pipeline import DiffusionVideo2WorldGenerationPipeline
24
+ from Cosmos.utils import log
25
+ from Cosmos.utils.io import read_prompts_from_file, save_video
26
+
27
+ torch.enable_grad(False)
28
+
29
+
30
def parse_arguments() -> argparse.Namespace:
    """Build the CLI parser for the video2world demo and parse sys.argv."""
    parser = argparse.ArgumentParser(description="Video to world generation demo script")
    # Options shared by every inference demo script.
    add_common_arguments(parser)

    # Options specific to video2world generation.
    parser.add_argument(
        "--diffusion_transformer_dir",
        type=str,
        default="Cosmos-1.0-Diffusion-7B-Video2World",
        choices=[
            "Cosmos-1.0-Diffusion-7B-Video2World",
            "Cosmos-1.0-Diffusion-14B-Video2World",
        ],
        help="DiT model weights directory name relative to checkpoint_dir",
    )
    parser.add_argument(
        "--prompt_upsampler_dir",
        type=str,
        default="Pixtral-12B",
        help="Prompt upsampler weights directory relative to checkpoint_dir",
    )
    parser.add_argument(
        "--input_image_or_video_path",
        type=str,
        help="Input video/image path for generating a single video",
    )
    parser.add_argument(
        "--num_input_frames",
        type=int,
        default=1,
        choices=[1, 9],
        help="Number of input frames for video2world prediction",
    )

    return parser.parse_args()
66
+
67
+
68
def demo(cfg):
    """Run video-to-world generation demo.

    This function handles the main video-to-world generation pipeline, including:
    - Setting up the random seed for reproducibility
    - Initializing the generation pipeline with the provided configuration
    - Processing single or multiple prompts/images/videos from input
    - Generating videos from prompts and images/videos
    - Saving the generated videos and corresponding prompts to disk

    Args:
        cfg (argparse.Namespace): Configuration namespace containing:
            - Model configuration (checkpoint paths, model settings)
            - Generation parameters (guidance, steps, dimensions)
            - Input/output settings (prompts/images/videos, save paths)
            - Performance options (model offloading settings)

    The function will save:
        - Generated MP4 video files
        - Text files containing the processed prompts

    If guardrails block the generation, a critical log message is displayed
    and the function continues to the next prompt if available.
    """
    misc.set_random_seed(cfg.seed)
    inference_type = "video2world"
    validate_args(cfg, inference_type)

    # Initialize video2world generation model pipeline
    pipeline = DiffusionVideo2WorldGenerationPipeline(
        inference_type=inference_type,
        checkpoint_dir=cfg.checkpoint_dir,
        checkpoint_name=cfg.diffusion_transformer_dir,
        prompt_upsampler_dir=cfg.prompt_upsampler_dir,
        enable_prompt_upsampler=not cfg.disable_prompt_upsampler,
        offload_network=cfg.offload_diffusion_transformer,
        offload_tokenizer=cfg.offload_tokenizer,
        offload_text_encoder_model=cfg.offload_text_encoder_model,
        offload_prompt_upsampler=cfg.offload_prompt_upsampler,
        offload_guardrail_models=cfg.offload_guardrail_models,
        guidance=cfg.guidance,
        num_steps=cfg.num_steps,
        height=cfg.height,
        width=cfg.width,
        fps=cfg.fps,
        num_video_frames=cfg.num_video_frames,
        seed=cfg.seed,
        num_input_frames=cfg.num_input_frames,
    )

    # Handle multiple prompts if prompt file is provided
    if cfg.batch_input_path:
        # Bug fix: was `args.batch_input_path`, which is undefined when demo() is
        # imported and called directly (it relied on a module-level global).
        log.info(f"Reading batch inputs from path: {cfg.batch_input_path}")
        prompts = read_prompts_from_file(cfg.batch_input_path)
    else:
        # Single prompt case
        prompts = [{"prompt": cfg.prompt, "visual_input": cfg.input_image_or_video_path}]

    os.makedirs(cfg.video_save_folder, exist_ok=True)
    for i, input_dict in enumerate(prompts):
        current_prompt = input_dict.get("prompt", None)
        # A text prompt is only mandatory when the upsampler is disabled.
        if current_prompt is None and cfg.disable_prompt_upsampler:
            log.critical("Prompt is missing, skipping world generation.")
            continue
        current_image_or_video_path = input_dict.get("visual_input", None)
        if current_image_or_video_path is None:
            log.critical("Visual input is missing, skipping world generation.")
            continue

        # Check input frames
        if not check_input_frames(current_image_or_video_path, cfg.num_input_frames):
            continue

        # Generate video
        generated_output = pipeline.generate(
            prompt=current_prompt,
            image_or_video_path=current_image_or_video_path,
            negative_prompt=cfg.negative_prompt,
        )
        if generated_output is None:
            log.critical("Guardrail blocked video2world generation.")
            continue
        video, prompt = generated_output

        # Batch runs are saved by index; single runs by the configured name.
        if cfg.batch_input_path:
            video_save_path = os.path.join(cfg.video_save_folder, f"{i}.mp4")
            prompt_save_path = os.path.join(cfg.video_save_folder, f"{i}.txt")
        else:
            video_save_path = os.path.join(cfg.video_save_folder, f"{cfg.video_save_name}.mp4")
            prompt_save_path = os.path.join(cfg.video_save_folder, f"{cfg.video_save_name}.txt")

        # Save video
        save_video(
            video=video,
            fps=cfg.fps,
            H=cfg.height,
            W=cfg.width,
            video_save_quality=5,
            video_save_path=video_save_path,
        )

        # Save prompt to text file alongside video (binary mode + explicit
        # encode keeps the bytes identical across platforms).
        with open(prompt_save_path, "wb") as f:
            f.write(prompt.encode("utf-8"))

        log.info(f"Saved video to {video_save_path}")
        log.info(f"Saved prompt to {prompt_save_path}")
175
+
176
+
177
# Script entry point: parse the CLI arguments once and run the demo.
if __name__ == "__main__":
    args = parse_arguments()
    demo(args)
video2world_hf.py ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import argparse
17
+ import os
18
+
19
+ from Cosmos.utils import misc
20
+ import torch
21
+
22
+ from Cosmos.inference_utils import add_common_arguments, check_input_frames, validate_args
23
+ from Cosmos.world_generation_pipeline import DiffusionVideo2WorldGenerationPipeline
24
+ from Cosmos.utils import log
25
+ from Cosmos.utils.io import read_prompts_from_file, save_video
26
+
27
+ from Cosmos.download_diffusion import main as download_diffusion
28
+ from transformers import PreTrainedModel, PretrainedConfig
29
+
30
+ torch.enable_grad(False)
31
+
32
# Custom HF config class carrying all video2world pipeline settings.
class DiffusionVideo2WorldConfig(PretrainedConfig):
    model_type = "DiffusionVideo2World"

    # Every supported setting with its default value; kwargs override these.
    _SETTING_DEFAULTS = {
        "checkpoint_dir": "checkpoints",
        "tokenizer_dir": "Cosmos-1.0-Tokenizer-CV8x8x8",
        "video_save_name": "output",
        "video_save_folder": "outputs/",
        "prompt": None,
        "batch_input_path": None,
        "negative_prompt": None,
        "num_steps": 35,
        "guidance": 7,
        "num_video_frames": 121,
        "height": 704,
        "width": 1280,
        "fps": 24,
        "seed": 1,
        "disable_prompt_upsampler": False,
        "offload_diffusion_transformer": False,
        "offload_tokenizer": False,
        "offload_text_encoder_model": False,
        "offload_prompt_upsampler": False,
        "offload_guardrail_models": False,
        "diffusion_transformer_dir": "Cosmos-1.0-Diffusion-7B-Video2World",
        "prompt_upsampler_dir": "Pixtral-12B",
        "input_image_or_video_path": None,
        "num_input_frames": 1,
    }

    def __init__(self, **kwargs):
        """Populate every setting from kwargs, falling back to the defaults above."""
        super().__init__(**kwargs)
        for setting, default in self._SETTING_DEFAULTS.items():
            setattr(self, setting, kwargs.get(setting, default))
62
+
63
class DiffusionVideo2World(PreTrainedModel):
    """HF PreTrainedModel wrapper around DiffusionVideo2WorldGenerationPipeline."""

    config_class = DiffusionVideo2WorldConfig

    def __init__(self, config=None):
        """Build the generation pipeline from `config`.

        Args:
            config: DiffusionVideo2WorldConfig with all pipeline settings.
                Bug fix: the default used to be a single shared
                `DiffusionVideo2WorldConfig()` instance created at
                class-definition time (mutable default argument); a fresh
                config is now created per call.
        """
        if config is None:
            config = DiffusionVideo2WorldConfig()
        super().__init__(config)
        cfg = config

        misc.set_random_seed(cfg.seed)
        inference_type = "video2world"
        validate_args(cfg, inference_type)

        self.pipeline = DiffusionVideo2WorldGenerationPipeline(
            inference_type=inference_type,
            checkpoint_dir=cfg.checkpoint_dir,
            checkpoint_name=cfg.diffusion_transformer_dir,
            prompt_upsampler_dir=cfg.prompt_upsampler_dir,
            enable_prompt_upsampler=not cfg.disable_prompt_upsampler,
            offload_network=cfg.offload_diffusion_transformer,
            offload_tokenizer=cfg.offload_tokenizer,
            offload_text_encoder_model=cfg.offload_text_encoder_model,
            offload_prompt_upsampler=cfg.offload_prompt_upsampler,
            offload_guardrail_models=cfg.offload_guardrail_models,
            guidance=cfg.guidance,
            num_steps=cfg.num_steps,
            height=cfg.height,
            width=cfg.width,
            fps=cfg.fps,
            num_video_frames=cfg.num_video_frames,
            seed=cfg.seed,
            num_input_frames=cfg.num_input_frames,
        )

    def forward(self):
        """Generate videos for the configured prompt(s) and save them to disk."""
        cfg = self.config

        # Handle multiple prompts if prompt file is provided
        if cfg.batch_input_path:
            # Bug fix: was `args.batch_input_path` — `args` is undefined here.
            log.info(f"Reading batch inputs from path: {cfg.batch_input_path}")
            prompts = read_prompts_from_file(cfg.batch_input_path)
        else:
            # Single prompt case
            prompts = [{"prompt": cfg.prompt, "visual_input": cfg.input_image_or_video_path}]

        os.makedirs(cfg.video_save_folder, exist_ok=True)
        for i, input_dict in enumerate(prompts):
            current_prompt = input_dict.get("prompt", None)
            # A text prompt is only mandatory when the upsampler is disabled.
            if current_prompt is None and cfg.disable_prompt_upsampler:
                log.critical("Prompt is missing, skipping world generation.")
                continue
            current_image_or_video_path = input_dict.get("visual_input", None)
            if current_image_or_video_path is None:
                log.critical("Visual input is missing, skipping world generation.")
                continue

            # Check input frames
            if not check_input_frames(current_image_or_video_path, cfg.num_input_frames):
                continue

            # Generate video.
            # Bug fix: was `pipeline.generate(...)` — `pipeline` is undefined in
            # this scope; the pipeline is stored on the instance in __init__.
            generated_output = self.pipeline.generate(
                prompt=current_prompt,
                image_or_video_path=current_image_or_video_path,
                negative_prompt=cfg.negative_prompt,
            )
            if generated_output is None:
                log.critical("Guardrail blocked video2world generation.")
                continue
            video, prompt = generated_output

            if cfg.batch_input_path:
                video_save_path = os.path.join(cfg.video_save_folder, f"{i}.mp4")
                prompt_save_path = os.path.join(cfg.video_save_folder, f"{i}.txt")
            else:
                video_save_path = os.path.join(cfg.video_save_folder, f"{cfg.video_save_name}.mp4")
                prompt_save_path = os.path.join(cfg.video_save_folder, f"{cfg.video_save_name}.txt")

            # Save video
            save_video(
                video=video,
                fps=cfg.fps,
                H=cfg.height,
                W=cfg.width,
                video_save_quality=5,
                video_save_path=video_save_path,
            )

            # Save prompt to text file alongside video
            with open(prompt_save_path, "wb") as f:
                f.write(prompt.encode("utf-8"))

            log.info(f"Saved video to {video_save_path}")
            log.info(f"Saved prompt to {prompt_save_path}")

    def save_pretrained(self, save_directory, **kwargs):
        # Intentionally a no-op: weights are managed by the Cosmos checkpoint
        # downloader, not by HF serialization, but the override is required.
        pass

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
        """Download the required diffusion checkpoints, then build the model.

        Expects a `config` kwarg (DiffusionVideo2WorldConfig); all remaining
        kwargs are merged into that config before construction.
        """
        config = kwargs["config"]
        other_args = {k: v for k, v in kwargs.items() if k != "config"}
        config.update(other_args)
        # Pick the matching model size from the configured transformer dir.
        model_sizes = ["7B"] if "7B" in config.diffusion_transformer_dir else ["14B"]
        model_types = ["Video2World"]
        download_diffusion(model_types, model_sizes, config.checkpoint_dir)
        return cls(config)
171
+
172
def demo(cfg):
    """Run video-to-world generation demo.

    This function handles the main video-to-world generation pipeline, including:
    - Setting up the random seed for reproducibility
    - Initializing the generation pipeline with the provided configuration
    - Processing single or multiple prompts/images/videos from input
    - Generating videos from prompts and images/videos
    - Saving the generated videos and corresponding prompts to disk

    Args:
        cfg (argparse.Namespace): Configuration namespace containing:
            - Model configuration (checkpoint paths, model settings)
            - Generation parameters (guidance, steps, dimensions)
            - Input/output settings (prompts/images/videos, save paths)
            - Performance options (model offloading settings)

    The function will save:
        - Generated MP4 video files
        - Text files containing the processed prompts

    If guardrails block the generation, a critical log message is displayed
    and the function continues to the next prompt if available.
    """
    misc.set_random_seed(cfg.seed)
    inference_type = "video2world"
    validate_args(cfg, inference_type)

    # Initialize video2world generation model pipeline.
    pipeline = DiffusionVideo2WorldGenerationPipeline(
        inference_type=inference_type,
        checkpoint_dir=cfg.checkpoint_dir,
        checkpoint_name=cfg.diffusion_transformer_dir,
        prompt_upsampler_dir=cfg.prompt_upsampler_dir,
        enable_prompt_upsampler=not cfg.disable_prompt_upsampler,
        offload_network=cfg.offload_diffusion_transformer,
        offload_tokenizer=cfg.offload_tokenizer,
        offload_text_encoder_model=cfg.offload_text_encoder_model,
        offload_prompt_upsampler=cfg.offload_prompt_upsampler,
        offload_guardrail_models=cfg.offload_guardrail_models,
        guidance=cfg.guidance,
        num_steps=cfg.num_steps,
        height=cfg.height,
        width=cfg.width,
        fps=cfg.fps,
        num_video_frames=cfg.num_video_frames,
        seed=cfg.seed,
        num_input_frames=cfg.num_input_frames,
    )

    # Handle multiple prompts if prompt file is provided.
    if cfg.batch_input_path:
        # FIX: was `args.batch_input_path` — `args` is only defined as a module
        # global when run as a script, so this raised NameError when `demo` was
        # called from imported code.
        log.info(f"Reading batch inputs from path: {cfg.batch_input_path}")
        prompts = read_prompts_from_file(cfg.batch_input_path)
    else:
        # Single prompt case.
        prompts = [{"prompt": cfg.prompt, "visual_input": cfg.input_image_or_video_path}]

    os.makedirs(cfg.video_save_folder, exist_ok=True)
    for i, input_dict in enumerate(prompts):
        current_prompt = input_dict.get("prompt", None)
        # A missing prompt is only fatal when the upsampler cannot invent one.
        if current_prompt is None and cfg.disable_prompt_upsampler:
            log.critical("Prompt is missing, skipping world generation.")
            continue
        current_image_or_video_path = input_dict.get("visual_input", None)
        if current_image_or_video_path is None:
            log.critical("Visual input is missing, skipping world generation.")
            continue

        # Check input frames.
        if not check_input_frames(current_image_or_video_path, cfg.num_input_frames):
            continue

        # Generate video.
        generated_output = pipeline.generate(
            prompt=current_prompt,
            image_or_video_path=current_image_or_video_path,
            negative_prompt=cfg.negative_prompt,
        )
        if generated_output is None:
            log.critical("Guardrail blocked video2world generation.")
            continue
        video, prompt = generated_output

        if cfg.batch_input_path:
            video_save_path = os.path.join(cfg.video_save_folder, f"{i}.mp4")
            prompt_save_path = os.path.join(cfg.video_save_folder, f"{i}.txt")
        else:
            video_save_path = os.path.join(cfg.video_save_folder, f"{cfg.video_save_name}.mp4")
            prompt_save_path = os.path.join(cfg.video_save_folder, f"{cfg.video_save_name}.txt")

        # Save video.
        save_video(
            video=video,
            fps=cfg.fps,
            H=cfg.height,
            W=cfg.width,
            video_save_quality=5,
            video_save_path=video_save_path,
        )

        # Save prompt to text file alongside video.
        with open(prompt_save_path, "wb") as f:
            f.write(prompt.encode("utf-8"))

        log.info(f"Saved video to {video_save_path}")
        log.info(f"Saved prompt to {prompt_save_path}")
279
+
280
+
281
# Script entry point: parse CLI arguments and run the video2world demo.
if __name__ == "__main__":
    args = parse_arguments()
    demo(args)
video2world_prompt_upsampler_inference.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """
17
+ This demo script is used to run inference for Pixtral-12B.
18
+ Command:
19
+ PYTHONPATH=$(pwd) python cosmos1/models/diffusion/prompt_upsampler/video2world_prompt_upsampler_inference.py
20
+
21
+ """
22
+
23
+ import argparse
24
+ import os
25
+ from math import ceil
26
+
27
+ from PIL import Image
28
+
29
+ from cosmos1.models.autoregressive.configs.base.model_config import create_vision_language_model_config
30
+ from cosmos1.models.autoregressive.model import AutoRegressiveModel
31
+ from cosmos1.models.diffusion.prompt_upsampler.inference import chat_completion
32
+ from Cosmos import guardrail_presets as guardrail_presets
33
+ from Cosmos.utils import log
34
+ from Cosmos.utils.io import load_from_fileobj
35
+
36
+
37
def create_vlm_prompt_upsampler(
    checkpoint_dir: str, tokenizer_ckpt_path: str = "mistral-community/pixtral-12b"
) -> AutoRegressiveModel:
    """Build the Pixtral-12B vision-language prompt upsampler and move it to GPU.

    Args:
        checkpoint_dir: Directory containing the fine-tuned ``model.pt`` weights.
        tokenizer_ckpt_path: Tokenizer checkpoint reference; defaults to the
            public Pixtral-12B tokenizer.

    Returns:
        AutoRegressiveModel: The instantiated instruct model on the CUDA device.
    """
    model_config, tokenizer_config = create_vision_language_model_config(
        model_ckpt_path=os.path.join(checkpoint_dir, "model.pt"),
        tokenizer_ckpt_path=tokenizer_ckpt_path,
        model_family="pixtral",
        model_size="12b",
        is_instruct_model=True,
        max_batch_size=1,
        max_seq_len=4300,
        pytorch_rope_version="v1",
    )
    # build() downloads the weights if they are not already cached, then loads them.
    model = AutoRegressiveModel.build(
        model_config=model_config,
        tokenizer_config=tokenizer_config,
    )
    return model.to("cuda")
60
+
61
+
62
def resize_image(image: Image.Image, max_size: int = 1024) -> Image.Image:
    """Downscale *image* so neither dimension exceeds *max_size*.

    Aspect ratio is preserved; images already within bounds are returned
    unchanged (the same object, no copy).
    """
    width, height = image.size
    # How far the larger dimension overshoots the allowed size.
    scale = max(width / max_size, height / max_size)
    if scale <= 1:
        return image
    return image.resize((ceil(width / scale), ceil(height / scale)))
72
+
73
+
74
def prepare_dialog(image_or_video_path: str) -> list[dict]:
    """Build a single-turn Pixtral dialog asking for a refined video description.

    Accepts either an ``.mp4`` video (its last frame is used as visual context)
    or a still image path. The image is capped at 1024px per side before being
    attached to the user message.
    """
    if image_or_video_path.endswith(".mp4"):
        # Use the final frame of the video as the visual conditioning input.
        video_np, _ = load_from_fileobj(image_or_video_path, format="mp4")
        image_frame = video_np[-1]
        image = Image.fromarray(image_frame)
    else:
        image: Image.Image = Image.open(image_or_video_path)

    image = resize_image(image, max_size=1024)
    prompt = """\
Your task is to transform a given prompt into a refined and concise video description, no more than 150 words.
Focus only on the content, no filler words or descriptions on the style. Never mention things outside the video.
""".strip()

    # "[IMG]" marks where the attached image is inserted in the Pixtral chat template.
    return [
        {
            "role": "user",
            "content": "[IMG]\n" + prompt,
            "images": [image],
        }
    ]
95
+
96
+
97
def run_chat_completion(pixtral: AutoRegressiveModel, dialog: list[dict], **inference_args) -> str:
    """Run one chat-completion round on *dialog* and return the generated text.

    Args:
        pixtral: The loaded Pixtral prompt-upsampler model.
        dialog: A single dialog (list of role/content messages).
        **inference_args: Overrides for the default sampling arguments.

    Returns:
        str: The model's generated message content.
    """
    call_args = dict(
        max_gen_len=400,
        temperature=0,
        top_p=0.9,
        logprobs=False,
        compile_sampling=False,
        compile_prefill=False,
    )
    call_args.update(inference_args)
    results = chat_completion(
        pixtral,
        [dialog],
        **call_args,
    )
    # One dialog in, exactly one completion out.
    assert len(results) == 1
    return str(results[0]["generation"]["content"])
115
+
116
+
117
def parse_args() -> argparse.Namespace:
    """Parse command-line arguments for the standalone prompt-upsampler demo."""
    p = argparse.ArgumentParser(description="Run prompt upsampler inference")
    p.add_argument(
        "--image_or_video_path",
        type=str,
        default="cosmos1/models/diffusion/assets/v1p0/video2world_input0.jpg",
    )
    p.add_argument("--temperature", type=float, default=0.01, help="Inference temperature")
    p.add_argument("--top_p", type=float, default=0.9, help="Top-p value for top-p sampling")
    p.add_argument(
        "--checkpoint_dir",
        type=str,
        default="checkpoints",
        help="Base directory containing model checkpoints",
    )
    p.add_argument(
        "--prompt_upsampler_dir",
        type=str,
        default="Pixtral-12B",
        help="Prompt upsampler weights directory relative to checkpoint_dir",
    )
    p.add_argument(
        "--guardrail_dir",
        type=str,
        default="Cosmos-1.0-Guardrail",
        help="Guardrail weights directory relative to checkpoint_dir",
    )
    return p.parse_args()
140
+
141
+
142
def main(args):
    """Upsample a prompt from a conditioning image/video and guardrail-check it.

    Logs the upsampled prompt on success; logs critically and returns early if
    the guardrail flags the generated text as unsafe.
    """
    guardrail_runner = guardrail_presets.create_text_guardrail_runner(
        os.path.join(args.checkpoint_dir, args.guardrail_dir)
    )

    upsampler = create_vlm_prompt_upsampler(os.path.join(args.checkpoint_dir, args.prompt_upsampler_dir))
    dialog = prepare_dialog(args.image_or_video_path)
    upsampled_prompt = run_chat_completion(
        upsampler,
        dialog,
        max_gen_len=400,
        temperature=args.temperature,
        top_p=args.top_p,
        logprobs=False,
    )
    if not guardrail_presets.run_text_guardrail(upsampled_prompt, guardrail_runner):
        log.critical("Upsampled text prompt is not safe.")
        return

    log.info(f"Upsampled prompt: {upsampled_prompt}")
163
+
164
+
165
# Script entry point: parse CLI arguments and run the upsampler demo.
if __name__ == "__main__":
    args = parse_args()
    main(args)
world_generation_pipeline.py ADDED
@@ -0,0 +1,658 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import gc
17
+ import os
18
+ from typing import Any, Optional
19
+
20
+ import numpy as np
21
+ import torch
22
+
23
+ from Cosmos.base_world_generation_pipeline import BaseWorldGenerationPipeline
24
+ from Cosmos.inference_utils import (
25
+ generate_world_from_text,
26
+ generate_world_from_video,
27
+ get_condition_latent,
28
+ get_video_batch,
29
+ load_model_by_config,
30
+ load_network_model,
31
+ load_tokenizer_model,
32
+ )
33
+ from Cosmos.model_t2w import DiffusionT2WModel
34
+ from Cosmos.model_v2w import DiffusionV2WModel
35
+ from Cosmos.text2world_prompt_upsampler_inference import (
36
+ create_prompt_upsampler,
37
+ run_chat_completion,
38
+ )
39
+ from Cosmos.video2world_prompt_upsampler_inference import (
40
+ create_vlm_prompt_upsampler,
41
+ prepare_dialog,
42
+ )
43
+ from Cosmos.video2world_prompt_upsampler_inference import (
44
+ run_chat_completion as run_chat_completion_vlm,
45
+ )
46
+ from Cosmos.utils import log
47
+
48
# Maps released checkpoint directory names to the config job names that
# load_model_by_config uses when instantiating the diffusion transformer.
MODEL_NAME_DICT = {
    "Cosmos-1.0-Diffusion-7B-Text2World": "Cosmos_1_0_Diffusion_Text2World_7B",
    "Cosmos-1.0-Diffusion-14B-Text2World": "Cosmos_1_0_Diffusion_Text2World_14B",
    "Cosmos-1.0-Diffusion-7B-Video2World": "Cosmos_1_0_Diffusion_Video2World_7B",
    "Cosmos-1.0-Diffusion-14B-Video2World": "Cosmos_1_0_Diffusion_Video2World_14B",
}
54
+
55
+
56
class DiffusionText2WorldGenerationPipeline(BaseWorldGenerationPipeline):
    def __init__(
        self,
        inference_type: str,
        checkpoint_dir: str,
        checkpoint_name: str,
        prompt_upsampler_dir: Optional[str] = None,
        enable_prompt_upsampler: bool = True,
        enable_text_guardrail: bool = True,
        enable_video_guardrail: bool = True,
        offload_network: bool = False,
        offload_tokenizer: bool = False,
        offload_text_encoder_model: bool = False,
        offload_prompt_upsampler: bool = False,
        offload_guardrail_models: bool = False,
        guidance: float = 7.0,
        num_steps: int = 35,
        height: int = 704,
        width: int = 1280,
        fps: int = 24,
        num_video_frames: int = 121,
        seed: int = 0,
    ):
        """Initialize the diffusion world generation pipeline.

        Args:
            inference_type: Type of world generation ('text2world' or 'video2world')
            checkpoint_dir: Base directory containing model checkpoints
            checkpoint_name: Name of the diffusion transformer checkpoint to use
            prompt_upsampler_dir: Directory containing prompt upsampler model weights
            enable_prompt_upsampler: Whether to use prompt upsampling
            enable_text_guardrail: Whether to enable text guardrail
            enable_video_guardrail: Whether to enable video guardrail
            offload_network: Whether to offload diffusion transformer after inference
            offload_tokenizer: Whether to offload tokenizer after inference
            offload_text_encoder_model: Whether to offload T5 model after inference
            offload_prompt_upsampler: Whether to offload prompt upsampler
            offload_guardrail_models: Whether to offload guardrail models
            guidance: Classifier-free guidance scale
            num_steps: Number of diffusion sampling steps
            height: Height of output video
            width: Width of output video
            fps: Frames per second of output video
            num_video_frames: Number of frames to generate
            seed: Random seed for sampling

        Raises:
            KeyError: If ``checkpoint_name`` is not a released checkpoint
                listed in MODEL_NAME_DICT.
        """
        assert inference_type in [
            "text2world",
            "video2world",
        ], "Invalid inference_type, must be 'text2world' or 'video2world'"

        # Resolve the config job name before the (potentially expensive)
        # base-class initialization.
        self.model_name = MODEL_NAME_DICT[checkpoint_name]
        self.guidance = guidance
        self.num_steps = num_steps
        self.height = height
        self.width = width
        self.fps = fps
        self.num_video_frames = num_video_frames
        self.seed = seed

        super().__init__(
            inference_type=inference_type,
            checkpoint_dir=checkpoint_dir,
            checkpoint_name=checkpoint_name,
            enable_text_guardrail=enable_text_guardrail,
            enable_video_guardrail=enable_video_guardrail,
            offload_network=offload_network,
            offload_tokenizer=offload_tokenizer,
            offload_text_encoder_model=offload_text_encoder_model,
            offload_guardrail_models=offload_guardrail_models,
        )
        self.prompt_upsampler_dir = prompt_upsampler_dir
        self.enable_prompt_upsampler = enable_prompt_upsampler
        self.offload_prompt_upsampler = offload_prompt_upsampler

        self.prompt_upsampler = None
        # Eagerly load the upsampler only when it will stay resident; an
        # offloaded upsampler is loaded on demand per call instead.
        if enable_prompt_upsampler and not offload_prompt_upsampler:
            self._load_prompt_upsampler_model()
134
+
135
+ def _load_prompt_upsampler_model(self):
136
+ self.prompt_upsampler = create_prompt_upsampler(
137
+ checkpoint_dir=os.path.join(self.checkpoint_dir, self.prompt_upsampler_dir),
138
+ )
139
+
140
    def _load_model(self):
        """Instantiate the text2world diffusion transformer (weights loaded separately)."""
        self.model = load_model_by_config(
            config_job_name=self.model_name,
            config_file="cosmos1/models/diffusion/config/config.py",
            model_class=DiffusionT2WModel,
        )
146
+
147
    def _load_network(self):
        """Load the diffusion transformer weights from the selected checkpoint."""
        load_network_model(self.model, f"{self.checkpoint_dir}/{self.checkpoint_name}/model.pt")
149
+
150
    def _load_tokenizer(self):
        """Load the Cosmos-1.0-Tokenizer-CV8x8x8 video tokenizer weights into the model."""
        load_tokenizer_model(self.model, f"{self.checkpoint_dir}/Cosmos-1.0-Tokenizer-CV8x8x8")
152
+
153
+ def _offload_prompt_upsampler_model(self):
154
+ """Move prompt enhancement model to CPU/disk.
155
+
156
+ Offloads prompt upsampling model after processing input
157
+ to reduce GPU memory usage.
158
+ """
159
+ if self.prompt_upsampler:
160
+ del self.prompt_upsampler
161
+ self.prompt_upsampler = None
162
+ gc.collect()
163
+ torch.cuda.empty_cache()
164
+
165
+ def _run_prompt_upsampler_on_prompt(self, prompt: str) -> str:
166
+ """Enhance the input prompt using the prompt upsampler model.
167
+
168
+ Args:
169
+ prompt: Raw text prompt to be enhanced
170
+
171
+ Returns:
172
+ str: Enhanced version of the input prompt with more descriptive details
173
+ """
174
+ upsampled_prompt = run_chat_completion(self.prompt_upsampler, prompt)
175
+ log.info(f"Upsampled prompt: {upsampled_prompt}")
176
+ return upsampled_prompt
177
+
178
+ def _run_prompt_upsampler_on_prompt_with_offload(self, *args: Any, **kwargs: Any) -> str:
179
+ """Enhance prompt with prompt upsampler model.
180
+
181
+ Args:
182
+ *args: Positional arguments
183
+ **kwargs: Keyword arguments
184
+
185
+ Returns:
186
+ Enhanced prompt string
187
+ """
188
+ if self.offload_prompt_upsampler:
189
+ self._load_prompt_upsampler_model()
190
+
191
+ enhanced_prompt = self._run_prompt_upsampler_on_prompt(*args, **kwargs)
192
+
193
+ if self.offload_prompt_upsampler:
194
+ self._offload_prompt_upsampler_model()
195
+
196
+ return enhanced_prompt
197
+
198
    def _run_tokenizer_decoding(self, sample: torch.Tensor) -> np.ndarray:
        """Decode latent samples to video frames using the tokenizer decoder.

        Args:
            sample: Latent tensor from diffusion model [B, C, T, H, W]

        Returns:
            np.ndarray: Decoded video frames as uint8 numpy array [T, H, W, C]
            with values in range [0, 255]
        """
        # Decode video: shift/clamp/halve maps the decoder output into [0, 1]
        # (the decoder is presumably in [-1, 1] — the clamp bounds the
        # intermediate [0, 2] range before halving).
        video = (1.0 + self.model.decode(sample)).clamp(0, 2) / 2  # [B, 3, T, H, W]
        # Keep only the first batch element and convert C,T,H,W -> T,H,W,C uint8.
        video = (video[0].permute(1, 2, 3, 0) * 255).to(torch.uint8).cpu().numpy()

        return video
213
+
214
    def _run_model(
        self,
        embedding: torch.Tensor,
        negative_prompt_embedding: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        """Generate video latents using the diffusion model.

        Args:
            embedding: Text embedding tensor from text encoder
            negative_prompt_embedding: Optional embedding for negative prompt guidance

        Returns:
            torch.Tensor: Generated video latents before tokenizer decoding

        Note:
            The model and tokenizer are automatically offloaded after inference
            if offloading is enabled in the config.
        """
        # Get video batch and state shape
        data_batch, state_shape = get_video_batch(
            model=self.model,
            prompt_embedding=embedding,
            negative_prompt_embedding=negative_prompt_embedding,
            height=self.height,
            width=self.width,
            fps=self.fps,
            num_video_frames=self.num_video_frames,
        )

        # Generate video frames; negative-prompt guidance is active only when
        # an embedding for it was actually supplied.
        sample = generate_world_from_text(
            model=self.model,
            state_shape=state_shape,
            is_negative_prompt=True if negative_prompt_embedding is not None else False,
            data_batch=data_batch,
            guidance=self.guidance,
            num_steps=self.num_steps,
            seed=self.seed,
        )

        return sample
255
+
256
    def _run_model_with_offload(
        self, prompt_embedding: torch.Tensor, negative_prompt_embedding: Optional[torch.Tensor] = None
    ) -> np.ndarray:
        """Generate world representation with automatic model offloading.

        Wraps the core generation process with model loading/offloading logic
        to minimize GPU memory usage during inference.

        Args:
            prompt_embedding: Text embedding tensor from the text encoder
            negative_prompt_embedding: Optional embedding for negative prompt guidance

        Returns:
            np.ndarray: Generated world representation as numpy array
        """
        if self.offload_network:
            self._load_network()

        if self.offload_tokenizer:
            self._load_tokenizer()

        sample = self._run_model(prompt_embedding, negative_prompt_embedding)

        if self.offload_network:
            self._offload_network()

        if self.offload_tokenizer:
            # NOTE(review): the tokenizer was loaded above and never offloaded
            # in between, so this second _load_tokenizer() looks redundant —
            # confirm whether _offload_network can invalidate the tokenizer
            # before simplifying.
            self._load_tokenizer()

        sample = self._run_tokenizer_decoding(sample)

        if self.offload_tokenizer:
            self._offload_tokenizer()
        return sample
290
+
291
    def generate(
        self,
        prompt: str,
        negative_prompt: Optional[str] = None,
        word_limit_to_skip_upsampler: Optional[int] = None,
    ) -> tuple[np.ndarray, str] | None:
        """Generate video from text prompt with optional negative prompt guidance.

        Pipeline steps:
        1. Run safety checks on input prompt
        2. Enhance prompt using upsampler if enabled
        3. Run safety checks on upsampled prompt if applicable
        4. Convert prompt to embeddings
        5. Generate video frames using diffusion
        6. Run safety checks and apply face blur on generated video frames

        Args:
            prompt: Text description of desired video
            negative_prompt: Optional text to guide what not to generate
            word_limit_to_skip_upsampler: Skip prompt upsampler for better robustness if the number of words in the prompt is greater than this value
        Returns:
            tuple: (
                Generated video frames as uint8 np.ndarray [T, H, W, C],
                Final prompt used for generation (may be enhanced)
            ), or None if content fails guardrail safety checks
        """
        log.info(f"Run with prompt: {prompt}")
        log.info(f"Run with negative prompt: {negative_prompt}")
        log.info(f"Run with prompt upsampler: {self.enable_prompt_upsampler}")

        if self.enable_text_guardrail:
            log.info("Run guardrail on prompt")
            is_safe = self._run_guardrail_on_prompt_with_offload(prompt)
            if not is_safe:
                log.critical("Input text prompt is not safe")
                return None
            log.info("Pass guardrail on prompt")

        # Enhance prompt; very long prompts skip upsampling for robustness.
        if self.enable_prompt_upsampler:
            word_count = len(prompt.split())
            if word_limit_to_skip_upsampler is None or word_count <= word_limit_to_skip_upsampler:
                log.info("Run prompt upsampler on prompt")
                prompt = self._run_prompt_upsampler_on_prompt_with_offload(prompt)
                # Re-check safety: the upsampler may have introduced new content.
                if self.enable_text_guardrail:
                    log.info("Run guardrail on upsampled prompt")
                    is_safe = self._run_guardrail_on_prompt_with_offload(prompt=prompt)
                    if not is_safe:
                        log.critical("Upsampled text prompt is not safe")
                        return None
                    log.info("Pass guardrail on upsampled prompt")
            else:
                log.info(
                    f"Skip prompt upsampler for better robustness because the number of words ({word_count}) in the prompt is greater than {word_limit_to_skip_upsampler}"
                )

        # Embed the (possibly enhanced) prompt and the optional negative prompt
        # in a single batch so both share one text-encoder pass.
        log.info("Run text embedding on prompt")
        if negative_prompt:
            prompts = [prompt, negative_prompt]
        else:
            prompts = [prompt]
        prompt_embeddings, _ = self._run_text_embedding_on_prompt_with_offload(prompts)
        prompt_embedding = prompt_embeddings[0]
        negative_prompt_embedding = prompt_embeddings[1] if negative_prompt else None
        log.info("Finish text embedding on prompt")

        # Generate video
        log.info("Run generation")
        video = self._run_model_with_offload(
            prompt_embedding,
            negative_prompt_embedding=negative_prompt_embedding,
        )
        log.info("Finish generation")

        if self.enable_video_guardrail:
            log.info("Run guardrail on generated video")
            video = self._run_guardrail_on_video_with_offload(video)
            if video is None:
                log.critical("Generated video is not safe")
                return None
            log.info("Pass guardrail on generated video")

        return video, prompt
374
+
375
+
376
class DiffusionVideo2WorldGenerationPipeline(DiffusionText2WorldGenerationPipeline):
    def __init__(
        self,
        inference_type: str,
        checkpoint_dir: str,
        checkpoint_name: str,
        prompt_upsampler_dir: Optional[str] = None,
        enable_prompt_upsampler: bool = True,
        enable_text_guardrail: bool = True,
        enable_video_guardrail: bool = True,
        offload_network: bool = False,
        offload_tokenizer: bool = False,
        offload_text_encoder_model: bool = False,
        offload_prompt_upsampler: bool = False,
        offload_guardrail_models: bool = False,
        guidance: float = 7.0,
        num_steps: int = 35,
        height: int = 704,
        width: int = 1280,
        fps: int = 24,
        num_video_frames: int = 121,
        seed: int = 0,
        num_input_frames: int = 1,
    ):
        """Initialize diffusion world generation pipeline.

        Args:
            inference_type: Type of world generation ('text2world' or 'video2world')
            checkpoint_dir: Base directory containing model checkpoints
            checkpoint_name: Name of the diffusion transformer checkpoint to use
            prompt_upsampler_dir: Directory containing prompt upsampler model weights
            enable_prompt_upsampler: Whether to use prompt upsampling
            enable_text_guardrail: Whether to enable text guardrail
            enable_video_guardrail: Whether to enable video guardrail
            offload_network: Whether to offload diffusion transformer after inference
            offload_tokenizer: Whether to offload tokenizer after inference
            offload_text_encoder_model: Whether to offload T5 model after inference
            offload_prompt_upsampler: Whether to offload prompt upsampler
            offload_guardrail_models: Whether to offload guardrail models
            guidance: Classifier-free guidance scale
            num_steps: Number of diffusion sampling steps
            height: Height of output video
            width: Width of output video
            fps: Frames per second of output video
            num_video_frames: Number of frames to generate
            seed: Random seed for sampling
            num_input_frames: Number of latent conditions
        """
        # Set before super().__init__ because the base initializer may trigger
        # model loading that reads this attribute.
        self.num_input_frames = num_input_frames
        super().__init__(
            inference_type=inference_type,
            checkpoint_dir=checkpoint_dir,
            checkpoint_name=checkpoint_name,
            prompt_upsampler_dir=prompt_upsampler_dir,
            enable_prompt_upsampler=enable_prompt_upsampler,
            enable_text_guardrail=enable_text_guardrail,
            enable_video_guardrail=enable_video_guardrail,
            offload_network=offload_network,
            offload_tokenizer=offload_tokenizer,
            offload_text_encoder_model=offload_text_encoder_model,
            offload_prompt_upsampler=offload_prompt_upsampler,
            offload_guardrail_models=offload_guardrail_models,
            guidance=guidance,
            num_steps=num_steps,
            height=height,
            width=width,
            fps=fps,
            num_video_frames=num_video_frames,
            seed=seed,
        )
446
+
447
+ def _run_prompt_upsampler_on_prompt(self, image_or_video_path: str) -> str:
448
+ """Enhance the input prompt using visual context from the conditioning image.
449
+
450
+ Args:
451
+ image_or_video_path: Path to conditioning image or video used for visual context
452
+
453
+ Returns:
454
+ str: Enhanced prompt incorporating visual details from the image
455
+ """
456
+ dialog = prepare_dialog(image_or_video_path)
457
+ upsampled_prompt = run_chat_completion_vlm(
458
+ self.prompt_upsampler, dialog, max_gen_len=400, temperature=0.01, top_p=0.9, logprobs=False
459
+ )
460
+ log.info(f"Upsampled prompt: {upsampled_prompt}")
461
+ return upsampled_prompt
462
+
463
+ def _load_prompt_upsampler_model(self):
464
+ self.prompt_upsampler = create_vlm_prompt_upsampler(
465
+ checkpoint_dir=os.path.join(self.checkpoint_dir, self.prompt_upsampler_dir),
466
+ )
467
+
468
+ def _load_model(self):
469
+ self.model = load_model_by_config(
470
+ config_job_name=self.model_name,
471
+ config_file="cosmos1/models/diffusion/config/config.py",
472
+ model_class=DiffusionV2WModel,
473
+ )
474
+
475
    def _run_model(
        self,
        embedding: torch.Tensor,
        condition_latent: torch.Tensor,
        negative_prompt_embedding: torch.Tensor | None = None,
    ) -> torch.Tensor:
        """Generate video frames using the diffusion model.

        Args:
            embedding: Text embedding tensor from T5 encoder
            condition_latent: Latent tensor from conditioning image or video
            negative_prompt_embedding: Optional embedding for negative prompt guidance

        Returns:
            Tensor of generated video frames

        Note:
            Model and tokenizer are automatically offloaded after inference
            if offloading is enabled.
        """
        # Get video batch and state shape
        data_batch, state_shape = get_video_batch(
            model=self.model,
            prompt_embedding=embedding,
            negative_prompt_embedding=negative_prompt_embedding,
            height=self.height,
            width=self.width,
            fps=self.fps,
            num_video_frames=self.num_video_frames,
        )

        # Generate video frames.
        # NOTE(review): the `state_shape` returned by get_video_batch is ignored
        # in favor of self.model.state_shape, and is_negative_prompt is
        # hard-coded True even when negative_prompt_embedding is None (unlike
        # the text2world variant) — confirm both are intentional.
        video = generate_world_from_video(
            model=self.model,
            state_shape=self.model.state_shape,
            is_negative_prompt=True,
            data_batch=data_batch,
            guidance=self.guidance,
            num_steps=self.num_steps,
            seed=self.seed,
            condition_latent=condition_latent,
            num_input_frames=self.num_input_frames,
        )

        return video
520
+
521
+ def _run_tokenizer_encoding(self, image_or_video_path: str) -> torch.Tensor:
522
+ """
523
+ Encode image to latent space
524
+
525
+ Args:
526
+ image_or_video_path: Path to conditioning image
527
+
528
+ Returns:
529
+ torch.Tensor: Latent tensor from tokenizer encoding
530
+ """
531
+ condition_latent = get_condition_latent(
532
+ model=self.model,
533
+ input_image_or_video_path=image_or_video_path,
534
+ num_input_frames=self.num_input_frames,
535
+ state_shape=self.model.state_shape,
536
+ )
537
+
538
+ return condition_latent
539
+
540
+ def _run_model_with_offload(
541
+ self,
542
+ prompt_embedding: torch.Tensor,
543
+ image_or_video_path: str,
544
+ negative_prompt_embedding: Optional[torch.Tensor] = None,
545
+ ) -> np.ndarray:
546
+ """Generate world representation with automatic model offloading.
547
+
548
+ Wraps the core generation process with model loading/offloading logic
549
+ to minimize GPU memory usage during inference.
550
+
551
+ Args:
552
+ prompt_embedding: Text embedding tensor from T5 encoder
553
+ image_or_video_path: Path to conditioning image or video
554
+ negative_prompt_embedding: Optional embedding for negative prompt guidance
555
+
556
+ Returns:
557
+ np.ndarray: Generated world representation as numpy array
558
+ """
559
+ if self.offload_tokenizer:
560
+ self._load_tokenizer()
561
+
562
+ condition_latent = self._run_tokenizer_encoding(image_or_video_path)
563
+
564
+ if self.offload_network:
565
+ self._load_network()
566
+
567
+ sample = self._run_model(prompt_embedding, condition_latent, negative_prompt_embedding)
568
+
569
+ if self.offload_network:
570
+ self._offload_network()
571
+
572
+ sample = self._run_tokenizer_decoding(sample)
573
+
574
+ if self.offload_tokenizer:
575
+ self._offload_tokenizer()
576
+
577
+ return sample
578
+
579
+ def generate(
580
+ self,
581
+ prompt: str,
582
+ image_or_video_path: str,
583
+ negative_prompt: Optional[str] = None,
584
+ ) -> tuple[np.ndarray, str] | None:
585
+ """Generate video from text prompt and optional image.
586
+
587
+ Pipeline steps:
588
+ 1. Run safety checks on input prompt
589
+ 2. Enhance prompt using upsampler if enabled
590
+ 3. Run safety checks on upsampled prompt if applicable
591
+ 4. Convert prompt to embeddings
592
+ 5. Generate video frames using diffusion
593
+ 6. Run safety checks and apply face blur on generated video frames
594
+
595
+ Args:
596
+ prompt: Text description of desired video
597
+ image_or_video_path: Path to conditioning image or video
598
+ negative_prompt: Optional text to guide what not to generate
599
+
600
+ Returns:
601
+ tuple: (
602
+ Generated video frames as uint8 np.ndarray [T, H, W, C],
603
+ Final prompt used for generation (may be enhanced)
604
+ ), or None if content fails guardrail safety checks
605
+ """
606
+ log.info(f"Run with prompt: {prompt}")
607
+ log.info(f"Run with image or video path: {image_or_video_path}")
608
+ log.info(f"Run with negative prompt: {negative_prompt}")
609
+ log.info(f"Run with prompt upsampler: {self.enable_prompt_upsampler}")
610
+
611
+ if self.enable_text_guardrail and not self.enable_prompt_upsampler:
612
+ log.info("Run guardrail on prompt")
613
+ is_safe = self._run_guardrail_on_prompt_with_offload(prompt)
614
+ if not is_safe:
615
+ log.critical("Input text prompt is not safe")
616
+ return None
617
+ log.info("Pass guardrail on prompt")
618
+
619
+ # Enhance prompt
620
+ if self.enable_prompt_upsampler:
621
+ log.info("Run prompt upsampler on image or video, input prompt is not used")
622
+ prompt = self._run_prompt_upsampler_on_prompt_with_offload(image_or_video_path=image_or_video_path)
623
+ if self.enable_text_guardrail:
624
+ log.info("Run guardrail on upsampled prompt")
625
+ is_safe = self._run_guardrail_on_prompt_with_offload(prompt)
626
+ if not is_safe:
627
+ log.critical("Upsampled text prompt is not safe")
628
+ return None
629
+ log.info("Pass guardrail on upsampled prompt")
630
+
631
+ log.info("Run text embedding on prompt")
632
+ if negative_prompt:
633
+ prompts = [prompt, negative_prompt]
634
+ else:
635
+ prompts = [prompt]
636
+ prompt_embeddings, _ = self._run_text_embedding_on_prompt_with_offload(prompts)
637
+ prompt_embedding = prompt_embeddings[0]
638
+ negative_prompt_embedding = prompt_embeddings[1] if negative_prompt else None
639
+ log.info("Finish text embedding on prompt")
640
+
641
+ # Generate video
642
+ log.info("Run generation")
643
+ video = self._run_model_with_offload(
644
+ prompt_embedding,
645
+ negative_prompt_embedding=negative_prompt_embedding,
646
+ image_or_video_path=image_or_video_path,
647
+ )
648
+ log.info("Finish generation")
649
+
650
+ if self.enable_video_guardrail:
651
+ log.info("Run guardrail on generated video")
652
+ video = self._run_guardrail_on_video_with_offload(video)
653
+ if video is None:
654
+ log.critical("Generated video is not safe")
655
+ return None
656
+ log.info("Pass guardrail on generated video")
657
+
658
+ return video, prompt