ltx-2.3-mlx / embedded_config.json

Upload MLX model via mlx-forge

a6816e3 verified 17 days ago

7.23 kB

	{
	"transformer": {
	"_class_name": "AVTransformer3DModel",
	"activation_fn": "gelu-approximate",
	"attention_bias": true,
	"attention_head_dim": 128,
	"attention_type": "default",
	"caption_channels": 3840,
	"cross_attention_dim": 4096,
	"double_self_attention": false,
	"dropout": 0.0,
	"in_channels": 128,
	"norm_elementwise_affine": false,
	"norm_eps": 1e-06,
	"norm_num_groups": 32,
	"num_attention_heads": 32,
	"num_embeds_ada_norm": 1000,
	"num_layers": 48,
	"num_vector_embeds": null,
	"only_cross_attention": false,
	"cross_attention_norm": true,
	"out_channels": 128,
	"upcast_attention": false,
	"use_linear_projection": false,
	"qk_norm": "rms_norm",
	"standardization_norm": "rms_norm",
	"positional_embedding_type": "rope",
	"positional_embedding_theta": 10000.0,
	"positional_embedding_max_pos": [
	20,
	2048,
	2048
	],
	"timestep_scale_multiplier": 1000,
	"av_ca_timestep_scale_multiplier": 1000.0,
	"causal_temporal_positioning": true,
	"audio_num_attention_heads": 32,
	"audio_attention_head_dim": 64,
	"use_audio_video_cross_attention": true,
	"share_ff": false,
	"audio_out_channels": 128,
	"audio_cross_attention_dim": 2048,
	"audio_positional_embedding_max_pos": [
	20
	],
	"av_cross_ada_norm": true,
	"use_embeddings_connector": true,
	"connector_attention_head_dim": 128,
	"connector_num_attention_heads": 32,
	"connector_num_layers": 8,
	"connector_positional_embedding_max_pos": [
	4096
	],
	"connector_num_learnable_registers": 128,
	"connector_norm_output": true,
	"use_middle_indices_grid": true,
	"apply_gated_attention": true,
	"connector_apply_gated_attention": true,
	"caption_projection_first_linear": false,
	"caption_projection_second_linear": false,
	"caption_proj_input_norm": false,
	"connector_learnable_registers_std": 1,
	"caption_proj_before_connector": true,
	"audio_connector_attention_head_dim": 64,
	"audio_connector_num_attention_heads": 32,
	"cross_attention_adaln": true,
	"text_encoder_norm_type": "per_token_rms",
	"rope_type": "split",
	"frequencies_precision": "float64"
	},
	"vae": {
	"_class_name": "CausalVideoAutoencoder",
	"dims": 3,
	"in_channels": 3,
	"out_channels": 3,
	"latent_channels": 128,
	"encoder_blocks": [
	[
	"res_x",
	{
	"num_layers": 4
	}
	],
	[
	"compress_space_res",
	{
	"multiplier": 2
	}
	],
	[
	"res_x",
	{
	"num_layers": 6
	}
	],
	[
	"compress_time_res",
	{
	"multiplier": 2
	}
	],
	[
	"res_x",
	{
	"num_layers": 4
	}
	],
	[
	"compress_all_res",
	{
	"multiplier": 2
	}
	],
	[
	"res_x",
	{
	"num_layers": 2
	}
	],
	[
	"compress_all_res",
	{
	"multiplier": 1
	}
	],
	[
	"res_x",
	{
	"num_layers": 2
	}
	]
	],
	"decoder_blocks": [
	[
	"res_x",
	{
	"num_layers": 4
	}
	],
	[
	"compress_space",
	{
	"multiplier": 2
	}
	],
	[
	"res_x",
	{
	"num_layers": 6
	}
	],
	[
	"compress_time",
	{
	"multiplier": 2
	}
	],
	[
	"res_x",
	{
	"num_layers": 4
	}
	],
	[
	"compress_all",
	{
	"multiplier": 1
	}
	],
	[
	"res_x",
	{
	"num_layers": 2
	}
	],
	[
	"compress_all",
	{
	"multiplier": 2
	}
	],
	[
	"res_x",
	{
	"num_layers": 2
	}
	]
	],
	"scaling_factor": 1.0,
	"norm_layer": "pixel_norm",
	"patch_size": 4,
	"latent_log_var": "uniform",
	"use_quant_conv": false,
	"causal_decoder": false,
	"timestep_conditioning": false,
	"normalize_latent_channels": false,
	"encoder_base_channels": 128,
	"decoder_base_channels": 128,
	"spatial_padding_mode": "zeros"
	},
	"scheduler": {
	"_class_name": "RectifiedFlowScheduler",
	"_diffusers_version": "0.25.1",
	"num_train_timesteps": 1000,
	"shifting": null,
	"base_resolution": null,
	"sampler": "LinearQuadratic"
	},
	"audio_vae": {
	"model": {
	"params": {
	"ddconfig": {
	"double_z": true,
	"mel_bins": 64,
	"z_channels": 8,
	"resolution": 256,
	"downsample_time": false,
	"in_channels": 2,
	"out_ch": 2,
	"ch": 128,
	"ch_mult": [
	1,
	2,
	4
	],
	"num_res_blocks": 2,
	"attn_resolutions": [],
	"dropout": 0.0,
	"mid_block_add_attention": false,
	"norm_type": "pixel",
	"causality_axis": "height"
	},
	"sampling_rate": 16000
	}
	},
	"preprocessing": {
	"audio": {
	"sampling_rate": 16000,
	"max_wav_value": 32768.0,
	"duration": 5.12,
	"stereo": true,
	"causal_padding": 3
	},
	"stft": {
	"filter_length": 1024,
	"hop_length": 160,
	"win_length": 1024,
	"causal": true
	},
	"mel": {
	"n_mel_channels": 64,
	"mel_fmin": 0,
	"mel_fmax": 8000
	}
	}
	},
	"vocoder": {
	"vocoder": {
	"upsample_initial_channel": 1536,
	"resblock": "AMP1",
	"upsample_rates": [
	5,
	2,
	2,
	2,
	2,
	2
	],
	"resblock_kernel_sizes": [
	3,
	7,
	11
	],
	"upsample_kernel_sizes": [
	11,
	4,
	4,
	4,
	4,
	4
	],
	"resblock_dilation_sizes": [
	[
	1,
	3,
	5
	],
	[
	1,
	3,
	5
	],
	[
	1,
	3,
	5
	]
	],
	"stereo": true,
	"use_tanh_at_final": false,
	"activation": "snakebeta",
	"use_bias_at_final": false
	},
	"bwe": {
	"upsample_initial_channel": 512,
	"resblock": "AMP1",
	"upsample_rates": [
	6,
	5,
	2,
	2,
	2
	],
	"resblock_kernel_sizes": [
	3,
	7,
	11
	],
	"upsample_kernel_sizes": [
	12,
	11,
	4,
	4,
	4
	],
	"resblock_dilation_sizes": [
	[
	1,
	3,
	5
	],
	[
	1,
	3,
	5
	],
	[
	1,
	3,
	5
	]
	],
	"stereo": true,
	"use_tanh_at_final": false,
	"activation": "snakebeta",
	"use_bias_at_final": false,
	"apply_final_activation": false,
	"input_sampling_rate": 16000,
	"output_sampling_rate": 48000,
	"hop_length": 80,
	"n_fft": 512,
	"win_size": 512,
	"num_mels": 64
	}
	}
	}