| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | from typing import Any, List, Union |
| |
|
| | import attrs |
| |
|
| | from .ar_configs_base_model import ModelConfig, TokenizerConfig |
| |
|
| |
|
@attrs.define(slots=False)
class DataShapeConfig:
    """
    Data shape config
    Args:
        latent_shape (list): Shape of the latent. Defaults to a fresh empty list per instance.
        num_video_frames (int or None): Presumably the number of video frames (confirm with the consumer). Defaults to None.
        height (int or None): Presumably the frame height (confirm with the consumer). Defaults to None.
        width (int or None): Presumably the frame width (confirm with the consumer). Defaults to None.
    """

    # A literal `[]` default is a single list object that attrs hands to every
    # instance, so mutating it on one config would leak into all the others.
    # A factory builds a new list for each instance instead.
    latent_shape: list = attrs.field(factory=list)
    num_video_frames: Union[None, int] = None
    height: Union[None, int] = None
    width: Union[None, int] = None
| |
|
| |
|
@attrs.define(slots=False)
class SamplingConfig:
    """
    Sampling config
    Args:
        temperature (float): Temperature value for controlling randomness in sampling. Defaults to 0.6.
        top_k (int or None): Top-k cutoff for sampling. Defaults to None (presumably meaning no top-k filtering; confirm against the sampler).
        top_p (float): Top-p probability threshold for nucleus sampling. Defaults to 0.9.
        compile_prefill (bool): Presumably toggles compilation of the prefill step (confirm against the consumer). Defaults to False.
        compile_sampling (bool): Presumably toggles compilation of the sampling step (confirm against the consumer). Defaults to True.
        logprobs (bool): Flag indicating whether to compute token log probabilities. Defaults to False.
        echo (bool): Flag indicating whether to include prompt tokens in the generated output. Defaults to False.
    """

    temperature: float = 0.6
    # The default is None, so the annotation has to admit None as well.
    top_k: Union[None, int] = None
    top_p: float = 0.9
    compile_prefill: bool = False
    compile_sampling: bool = True
    logprobs: bool = False
    echo: bool = False
| |
|
| |
|
@attrs.define(slots=False)
class DiffusionDecoderSamplingConfig:
    """
    Diffusion decoder sampling config
    Args:
        guidance (float): Guidance scale for the diffusion process. Controls how much the model follows the conditioning. Defaults to 1.8.
        sigma_min (float): Minimum noise level for the diffusion process. Defaults to 0.02.
        sigma (float): Initial noise level for the diffusion process. Defaults to 8.
        num_steps (int): Number of denoising steps to perform. Defaults to 15.
        overlap (int): Number of overlapping frames between video chunks during processing. Defaults to 2.
        continuous_tokenizer_channel (int): Number of channels in the continuous tokenizer of diffusion decoder. Keyword-only. Defaults to 16.
        continuous_tokenizer_spatial_compression_ratio (int): Spatial compression ratio for the continuous tokenizer of diffusion decoder. Keyword-only. Defaults to 8.
        dd_train_num_video_frames (int): Number of video frames used during training for diffusion decoder. Defaults to 57.
        max_iter (int): Iteration cap; what it bounds is not visible in this file (TODO confirm with the consumer). Defaults to 99.
        fps (int): Frames per second, presumably of the decoded video (TODO confirm with the consumer). Defaults to 24.
    """

    guidance: float = 1.8
    sigma_min: float = 0.02
    sigma: float = 8
    num_steps: int = 15
    overlap: int = 2
    # attrs only turns *annotated* names into fields. Without an annotation
    # these two were plain class attributes: absent from __init__, repr and eq
    # despite being documented as arguments. They are keyword-only so that the
    # positional order of the pre-existing __init__ parameters does not shift.
    continuous_tokenizer_channel: int = attrs.field(default=16, kw_only=True)
    continuous_tokenizer_spatial_compression_ratio: int = attrs.field(default=8, kw_only=True)
    dd_train_num_video_frames: int = 57
    max_iter: int = 99
    fps: int = 24
| |
|
| |
|
@attrs.define(slots=False)
class InferenceConfig:
    """
    Inference config
    Args:
        model_config (ModelConfig or None): Model config. Defaults to None.
        tokenizer_config (TokenizerConfig or None): Tokenizer config. Defaults to None.
        ckpt_path (str): Path to the checkpoint. Defaults to "".
        data_shape_config (DataShapeConfig or None): Data shape config (holds the latent shape). Defaults to None.
        defaults (List[Any]): Defaults list, rebuilt for every instance. The "_self_" entry suggests a
            Hydra-style defaults list (confirm against the config loader).
    """

    # These three default to None, so their annotations must admit None.
    model_config: Union[None, ModelConfig] = None
    tokenizer_config: Union[None, TokenizerConfig] = None
    ckpt_path: str = ""
    data_shape_config: Union[None, DataShapeConfig] = None

    # Built through a factory so each instance gets its own list and dicts.
    defaults: List[Any] = attrs.field(
        factory=lambda: [
            "_self_",
            {"data_val": None},
            {"data_shape_config": "video_shape_as_model_config"},
            {"eval_job": None},
        ]
    )
| |
|