ltx-2.3-mlx / embedded_config.json
dgrauet's picture
Upload MLX model via mlx-forge
99489e9 verified
{
"transformer": {
"_class_name": "AVTransformer3DModel",
"activation_fn": "gelu-approximate",
"attention_bias": true,
"attention_head_dim": 128,
"attention_type": "default",
"caption_channels": 3840,
"cross_attention_dim": 4096,
"double_self_attention": false,
"dropout": 0.0,
"in_channels": 128,
"norm_elementwise_affine": false,
"norm_eps": 1e-06,
"norm_num_groups": 32,
"num_attention_heads": 32,
"num_embeds_ada_norm": 1000,
"num_layers": 48,
"num_vector_embeds": null,
"only_cross_attention": false,
"cross_attention_norm": true,
"out_channels": 128,
"upcast_attention": false,
"use_linear_projection": false,
"qk_norm": "rms_norm",
"standardization_norm": "rms_norm",
"positional_embedding_type": "rope",
"positional_embedding_theta": 10000.0,
"positional_embedding_max_pos": [
20,
2048,
2048
],
"timestep_scale_multiplier": 1000,
"av_ca_timestep_scale_multiplier": 1000.0,
"causal_temporal_positioning": true,
"audio_num_attention_heads": 32,
"audio_attention_head_dim": 64,
"use_audio_video_cross_attention": true,
"share_ff": false,
"audio_out_channels": 128,
"audio_cross_attention_dim": 2048,
"audio_positional_embedding_max_pos": [
20
],
"av_cross_ada_norm": true,
"use_embeddings_connector": true,
"connector_attention_head_dim": 128,
"connector_num_attention_heads": 32,
"connector_num_layers": 8,
"connector_positional_embedding_max_pos": [
4096
],
"connector_num_learnable_registers": 128,
"connector_norm_output": true,
"use_middle_indices_grid": true,
"apply_gated_attention": true,
"connector_apply_gated_attention": true,
"caption_projection_first_linear": false,
"caption_projection_second_linear": false,
"caption_proj_input_norm": false,
"connector_learnable_registers_std": 1,
"caption_proj_before_connector": true,
"audio_connector_attention_head_dim": 64,
"audio_connector_num_attention_heads": 32,
"cross_attention_adaln": true,
"text_encoder_norm_type": "per_token_rms",
"rope_type": "split",
"frequencies_precision": "float64"
},
"vae": {
"_class_name": "CausalVideoAutoencoder",
"dims": 3,
"in_channels": 3,
"out_channels": 3,
"latent_channels": 128,
"encoder_blocks": [
[
"res_x",
{
"num_layers": 4
}
],
[
"compress_space_res",
{
"multiplier": 2
}
],
[
"res_x",
{
"num_layers": 6
}
],
[
"compress_time_res",
{
"multiplier": 2
}
],
[
"res_x",
{
"num_layers": 4
}
],
[
"compress_all_res",
{
"multiplier": 2
}
],
[
"res_x",
{
"num_layers": 2
}
],
[
"compress_all_res",
{
"multiplier": 1
}
],
[
"res_x",
{
"num_layers": 2
}
]
],
"decoder_blocks": [
[
"res_x",
{
"num_layers": 4
}
],
[
"compress_space",
{
"multiplier": 2
}
],
[
"res_x",
{
"num_layers": 6
}
],
[
"compress_time",
{
"multiplier": 2
}
],
[
"res_x",
{
"num_layers": 4
}
],
[
"compress_all",
{
"multiplier": 1
}
],
[
"res_x",
{
"num_layers": 2
}
],
[
"compress_all",
{
"multiplier": 2
}
],
[
"res_x",
{
"num_layers": 2
}
]
],
"scaling_factor": 1.0,
"norm_layer": "pixel_norm",
"patch_size": 4,
"latent_log_var": "uniform",
"use_quant_conv": false,
"causal_decoder": false,
"timestep_conditioning": false,
"normalize_latent_channels": false,
"encoder_base_channels": 128,
"decoder_base_channels": 128,
"spatial_padding_mode": "zeros"
},
"scheduler": {
"_class_name": "RectifiedFlowScheduler",
"_diffusers_version": "0.25.1",
"num_train_timesteps": 1000,
"shifting": null,
"base_resolution": null,
"sampler": "LinearQuadratic"
},
"audio_vae": {
"model": {
"params": {
"ddconfig": {
"double_z": true,
"mel_bins": 64,
"z_channels": 8,
"resolution": 256,
"downsample_time": false,
"in_channels": 2,
"out_ch": 2,
"ch": 128,
"ch_mult": [
1,
2,
4
],
"num_res_blocks": 2,
"attn_resolutions": [],
"dropout": 0.0,
"mid_block_add_attention": false,
"norm_type": "pixel",
"causality_axis": "height"
},
"sampling_rate": 16000
}
},
"preprocessing": {
"audio": {
"sampling_rate": 16000,
"max_wav_value": 32768.0,
"duration": 5.12,
"stereo": true,
"causal_padding": 3
},
"stft": {
"filter_length": 1024,
"hop_length": 160,
"win_length": 1024,
"causal": true
},
"mel": {
"n_mel_channels": 64,
"mel_fmin": 0,
"mel_fmax": 8000
}
}
},
"vocoder": {
"vocoder": {
"upsample_initial_channel": 1536,
"resblock": "AMP1",
"upsample_rates": [
5,
2,
2,
2,
2,
2
],
"resblock_kernel_sizes": [
3,
7,
11
],
"upsample_kernel_sizes": [
11,
4,
4,
4,
4,
4
],
"resblock_dilation_sizes": [
[
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"stereo": true,
"use_tanh_at_final": false,
"activation": "snakebeta",
"use_bias_at_final": false
},
"bwe": {
"upsample_initial_channel": 512,
"resblock": "AMP1",
"upsample_rates": [
6,
5,
2,
2,
2
],
"resblock_kernel_sizes": [
3,
7,
11
],
"upsample_kernel_sizes": [
12,
11,
4,
4,
4
],
"resblock_dilation_sizes": [
[
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"stereo": true,
"use_tanh_at_final": false,
"activation": "snakebeta",
"use_bias_at_final": false,
"apply_final_activation": false,
"input_sampling_rate": 16000,
"output_sampling_rate": 48000,
"hop_length": 80,
"n_fft": 512,
"win_size": 512,
"num_mels": 64
}
}
}