Spaces:
Build error
Build error
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any, List, Optional, Union

import attrs

from cosmos_predict1.autoregressive.configs.base.model import ModelConfig, TokenizerConfig
# NOTE(review): no decorator is visible on this class in the source as received;
# `attrs.field` is used elsewhere in this file, so confirm whether an attrs
# decorator is expected here as well.
class DataShapeConfig:
    """
    Data shape config
    Attributes:
        latent_shape (list): Shape of the latent. Defaults to an empty list.
        num_video_frames (Optional[int]): Defaults to None.
        height (Optional[int]): Defaults to None.
        width (Optional[int]): Defaults to None.
    """

    # NOTE: this list literal lives on the class, so every instance that does
    # not assign its own `latent_shape` shares (and can mutate) the same list.
    # Assign a new list instead of appending to the default.
    latent_shape: list = []
    num_video_frames: Optional[int] = None
    height: Optional[int] = None
    width: Optional[int] = None
class SamplingConfig:
    """
    Sampling config
    Args:
        temperature (float): Temperature value for controlling randomness in sampling. Defaults to 0.6.
        top_k (Optional[int]): Top-k value for sampling. Defaults to None.
            NOTE(review): presumably None disables top-k filtering; confirm against the sampler.
        top_p (float): Top-p probability threshold for nucleus sampling. Defaults to 0.9.
        compile_prefill (bool): Defaults to False.
            NOTE(review): presumably toggles compilation of the prefill step; confirm against the caller.
        compile_sampling (bool): Defaults to True.
            NOTE(review): presumably toggles compilation of the sampling step; confirm against the caller.
        logprobs (bool): Flag indicating whether to compute token log probabilities. Defaults to False.
        echo (bool): Flag indicating whether to include prompt tokens in the generated output. Defaults to False.
    """

    temperature: float = 0.6
    # Annotated Optional because the default is None, which a bare `int` excludes.
    top_k: Optional[int] = None
    top_p: float = 0.9
    compile_prefill: bool = False
    compile_sampling: bool = True
    logprobs: bool = False
    echo: bool = False
class DiffusionDecoderSamplingConfig:
    """
    Diffusion decoder sampling config
    Args:
        guidance (float): Guidance scale for the diffusion process. Controls how much the model follows the conditioning. Defaults to 1.8.
        sigma_min (float): Minimum noise level for the diffusion process. Defaults to 0.02.
        sigma (float): Initial noise level for the diffusion process. Defaults to 8.
        num_steps (int): Number of denoising steps to perform. Defaults to 15.
        overlap (int): Number of overlapping frames between video chunks during processing. Defaults to 2.
        continuous_tokenizer_channel (int): Number of channels in the continuous tokenizer of diffusion decoder. Defaults to 16.
        continuous_tokenizer_spatial_compression_ratio (int): Spatial compression ratio for the continuous tokenizer of diffusion decoder. Defaults to 8.
        dd_train_num_video_frames (int): Number of video frames used during training for diffusion decoder. Defaults to 57.
        max_iter (int): Defaults to 99.
            NOTE(review): presumably an upper bound on decoder iterations; confirm against the caller.
        fps (int): Defaults to 24.
            NOTE(review): presumably frames per second of the produced video; confirm against the caller.
    """

    guidance: float = 1.8
    sigma_min: float = 0.02
    sigma: float = 8
    num_steps: int = 15
    overlap: int = 2
    # The next two attributes are deliberately left without annotations, as in
    # the original. NOTE(review): annotation-driven tooling (attrs/dataclasses)
    # would treat them as plain class attributes rather than fields; annotating
    # them would change the order of any generated __init__, so confirm before
    # adding `: int`.
    continuous_tokenizer_channel = 16
    continuous_tokenizer_spatial_compression_ratio = 8
    dd_train_num_video_frames: int = 57
    max_iter: int = 99
    fps: int = 24
# `defaults` below is declared with `attrs.field(factory=...)`, which only turns
# into a real per-instance list when the class is processed by attrs; without
# the decorator the attribute would be an inert placeholder object.
# `slots=False` keeps a per-instance `__dict__`, as an undecorated class has.
# NOTE(review): the decorator is not visible in the source as received; confirm
# it matches the project's convention for config classes.
@attrs.define(slots=False)
class InferenceConfig:
    """
    Inference config
    Args:
        model_config (ModelConfig): Model config. Defaults to None.
        tokenizer_config (TokenizerConfig): Tokenizer config. Defaults to None.
        ckpt_path (str): Path to the checkpoint. Defaults to "".
        data_shape_config (DataShapeConfig): Data shape config (carries the latent shape). Defaults to None.
        defaults (List[Any]): Built fresh for every instance; holds "_self_" followed by
            single-key dicts for "data_val", "data_shape_config" and "eval_job".
            NOTE(review): this looks like a Hydra-style defaults list; confirm with the config loader.
    """

    model_config: Optional[ModelConfig] = None
    tokenizer_config: Optional[TokenizerConfig] = None
    ckpt_path: str = ""
    data_shape_config: Optional[DataShapeConfig] = None
    defaults: List[Any] = attrs.field(
        # A factory, not a literal, so instances never share one list object.
        factory=lambda: [
            "_self_",
            {"data_val": None},
            {"data_shape_config": "video_shape_as_model_config"},
            {"eval_job": None},
        ]
    )