|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""The default image and video tokenizer configs.""" |
|
|
|
|
|
from cosmos_predict1.tokenizer.modules import ( |
|
|
ContinuousFormulation, |
|
|
Decoder3DType, |
|
|
DecoderType, |
|
|
DiscreteQuantizer, |
|
|
Encoder3DType, |
|
|
EncoderType, |
|
|
) |
|
|
|
|
|
# Base settings for the continuous image tokenizer config (name "CI").
# The `continuous_image_*` variants in this module start from a copy of this
# mapping and override `patch_size` / `spatial_compression`.
continuous_image = {
    "attn_resolutions": [32],
    "channels": 128,
    "channels_mult": [2, 4, 4],
    "dropout": 0.0,
    "in_channels": 3,
    "spatial_compression": 16,
    "num_res_blocks": 2,
    "out_channels": 3,
    "resolution": 1024,
    "patch_size": 4,
    "patch_method": "haar",
    "latent_channels": 16,
    "z_channels": 16,
    "z_factor": 1,
    "name": "CI",
    # The selectors below store the `.name` attribute of the chosen member,
    # not the member object itself (presumably a plain string if these are
    # Enum classes -- confirm in cosmos_predict1.tokenizer.modules).
    "formulation": ContinuousFormulation.AE.name,
    "encoder": EncoderType.Default.name,
    "decoder": DecoderType.Default.name,
}
|
|
# Variant of `continuous_image` with patch_size=2 and spatial_compression=8.
# NOTE: this is a shallow copy, so list values (e.g. `channels_mult`) are the
# same objects as in `continuous_image`.
continuous_image_8x8_360p = {
    **continuous_image,
    "patch_size": 2,
    "spatial_compression": 8,
}
|
|
|
|
|
# Variant of `continuous_image` with patch_size=2. `spatial_compression` is
# restated explicitly even though it equals the base value (16).
# NOTE: this is a shallow copy, so list values (e.g. `channels_mult`) are the
# same objects as in `continuous_image`.
continuous_image_16x16_360p = {
    **continuous_image,
    "patch_size": 2,
    "spatial_compression": 16,
}
|
|
|
|
|
|
|
|
# Base settings for the discrete image tokenizer config (name "DI").
# The `discrete_image_*` variants in this module start from a copy of this
# mapping and override `patch_size` / `spatial_compression`.
discrete_image = {
    "attn_resolutions": [32],
    "channels": 128,
    "channels_mult": [2, 4, 4],
    "dropout": 0.0,
    "in_channels": 3,
    "spatial_compression": 16,
    "num_res_blocks": 2,
    "out_channels": 3,
    "resolution": 1024,
    "patch_size": 4,
    "patch_method": "haar",
    "z_channels": 256,
    "z_factor": 1,
    # Quantizer settings. The quantizer is stored as the `.name` of the
    # selected member rather than the member object itself.
    "quantizer": DiscreteQuantizer.FSQ.name,
    # `levels` has six entries, the same count as `embedding_dim` --
    # presumably the two must agree; confirm against the quantizer code.
    "embedding_dim": 6,
    "levels": [8, 8, 8, 5, 5, 5],
    "num_quantizers": 4,
    "name": "DI",
    # Encoder/decoder are likewise selected by member name.
    "encoder": EncoderType.Default.name,
    "decoder": DecoderType.Default.name,
}
|
|
# Variant of `discrete_image` with patch_size=2 and spatial_compression=8.
# NOTE: this is a shallow copy, so list values (e.g. `levels`) are the same
# objects as in `discrete_image`.
discrete_image_8x8_360p = {
    **discrete_image,
    "patch_size": 2,
    "spatial_compression": 8,
}
|
|
|
|
|
# Variant of `discrete_image` with patch_size=2. `spatial_compression` is
# restated explicitly even though it equals the base value (16).
# NOTE: this is a shallow copy, so list values (e.g. `levels`) are the same
# objects as in `discrete_image`.
discrete_image_16x16_360p = {
    **discrete_image,
    "patch_size": 2,
    "spatial_compression": 16,
}
|
|
|
|
|
# Base settings for the continuous video tokenizer config (name "CV").
# The `continuous_video_*` variants in this module start from a copy of this
# mapping and override the compression factors (and, for one, `patch_size`).
continuous_video = {
    "attn_resolutions": [32],
    "channels": 128,
    "channels_mult": [2, 4, 4],
    "dropout": 0.0,
    "in_channels": 3,
    "num_res_blocks": 2,
    "out_channels": 3,
    "resolution": 1024,
    "patch_size": 4,
    "patch_method": "haar",
    "latent_channels": 16,
    "z_channels": 16,
    "z_factor": 1,
    "num_groups": 1,
    "legacy_mode": False,
    "spatial_compression": 8,
    "temporal_compression": 8,
    # The selectors below store the `.name` attribute of the chosen member,
    # not the member object itself. The video configs use the 3D
    # encoder/decoder type classes rather than the image ones.
    "formulation": ContinuousFormulation.AE.name,
    "encoder": Encoder3DType.FACTORIZED.name,
    "decoder": Decoder3DType.FACTORIZED.name,
    "name": "CV",
}
|
|
|
|
|
# Variant of `continuous_video`. Both compression factors are restated
# explicitly even though they equal the base values (8 and 8).
# NOTE: this is a shallow copy, so list values (e.g. `channels_mult`) are the
# same objects as in `continuous_video`.
continuous_video_8x8x8_720p = {
    **continuous_video,
    "temporal_compression": 8,
    "spatial_compression": 8,
}
|
|
|
|
|
# Variant of `continuous_video` with temporal_compression=4 and patch_size=2;
# `spatial_compression` is restated at the base value (8).
# NOTE: this is a shallow copy, so list values (e.g. `channels_mult`) are the
# same objects as in `continuous_video`.
continuous_video_4x8x8_360p = {
    **continuous_video,
    "temporal_compression": 4,
    "spatial_compression": 8,
    "patch_size": 2,
}
|
|
|
|
|
|
|
|
# Base settings for the discrete video tokenizer config (name "DV").
# The `discrete_video_*` variants in this module start from a copy of this
# mapping and override the compression factors (and, for one, `z_channels`
# and `patch_size`).
discrete_video = {
    "attn_resolutions": [32],
    "channels": 128,
    "channels_mult": [2, 4, 4],
    "dropout": 0.0,
    "in_channels": 3,
    "num_res_blocks": 2,
    "out_channels": 3,
    "resolution": 1024,
    "patch_size": 4,
    "patch_method": "haar",
    "z_channels": 16,
    "z_factor": 1,
    "num_groups": 1,
    "legacy_mode": False,
    "spatial_compression": 16,
    "temporal_compression": 8,
    # Quantizer settings. The quantizer is stored as the `.name` of the
    # selected member rather than the member object itself.
    "quantizer": DiscreteQuantizer.FSQ.name,
    # `levels` has six entries, the same count as `embedding_dim` --
    # presumably the two must agree; confirm against the quantizer code.
    "embedding_dim": 6,
    "levels": [8, 8, 8, 5, 5, 5],
    # Encoder/decoder are likewise selected by member name, using the 3D
    # type classes rather than the image ones.
    "encoder": Encoder3DType.FACTORIZED.name,
    "decoder": Decoder3DType.FACTORIZED.name,
    "name": "DV",
}
|
|
|
|
|
# Variant of `discrete_video`. Both compression factors are restated
# explicitly even though they equal the base values (8 and 16).
# NOTE: this is a shallow copy, so list values (e.g. `levels`) are the same
# objects as in `discrete_video`.
discrete_video_8x16x16_720p = {
    **discrete_video,
    "temporal_compression": 8,
    "spatial_compression": 16,
}
|
|
|
|
|
# Variant of `discrete_video` that changes four settings: z_channels=256,
# temporal_compression=4, spatial_compression=8 and patch_size=2.
# NOTE: this is a shallow copy, so list values (e.g. `levels`) are the same
# objects as in `discrete_video`.
discrete_video_4x8x8_360p = {
    **discrete_video,
    "z_channels": 256,
    "temporal_compression": 4,
    "spatial_compression": 8,
    "patch_size": 2,
}
|
|
|