Upload CogVideoX-Fun-V1.5-5b-InP converted to MLX (bf16)

1478473 verified about 1 month ago

2.85 kB

	{
	"model_type": "cogvideox-fun-inpaint",
	"source": "alibaba-pai/CogVideoX-Fun-V1.5-5b-InP",
	"transformer": {
	"_class_name": "CogVideoXTransformer3DModel",
	"_diffusers_version": "0.32.0.dev0",
	"activation_fn": "gelu-approximate",
	"attention_bias": true,
	"attention_head_dim": 64,
	"dropout": 0.0,
	"flip_sin_to_cos": true,
	"freq_shift": 0,
	"in_channels": 33,
	"max_text_seq_length": 226,
	"norm_elementwise_affine": true,
	"norm_eps": 1e-05,
	"num_attention_heads": 48,
	"num_layers": 42,
	"out_channels": 16,
	"patch_bias": false,
	"patch_size": 2,
	"patch_size_t": 2,
	"sample_frames": 85,
	"sample_height": 384,
	"sample_width": 680,
	"spatial_interpolation_scale": 1.875,
	"temporal_compression_ratio": 4,
	"temporal_interpolation_scale": 1.0,
	"text_embed_dim": 4096,
	"time_embed_dim": 512,
	"timestep_activation_fn": "silu",
	"use_learned_positional_embeddings": false,
	"use_rotary_positional_embeddings": true,
	"add_noise_in_inpaint_model": true
	},
	"text_encoder": {
	"_name_or_path": "google/t5-v1_1-xxl",
	"architectures": [
	"T5EncoderModel"
	],
	"classifier_dropout": 0.0,
	"d_ff": 10240,
	"d_kv": 64,
	"d_model": 4096,
	"decoder_start_token_id": 0,
	"dense_act_fn": "gelu_new",
	"dropout_rate": 0.1,
	"eos_token_id": 1,
	"feed_forward_proj": "gated-gelu",
	"initializer_factor": 1.0,
	"is_encoder_decoder": true,
	"is_gated_act": true,
	"layer_norm_epsilon": 1e-06,
	"model_type": "t5",
	"num_decoder_layers": 24,
	"num_heads": 64,
	"num_layers": 24,
	"output_past": true,
	"pad_token_id": 0,
	"relative_attention_max_distance": 128,
	"relative_attention_num_buckets": 32,
	"tie_word_embeddings": false,
	"torch_dtype": "bfloat16",
	"transformers_version": "4.43.4",
	"use_cache": true,
	"vocab_size": 32128
	},
	"vae": {
	"_class_name": "AutoencoderKLCogVideoX",
	"_diffusers_version": "0.31.0.dev0",
	"act_fn": "silu",
	"block_out_channels": [
	128,
	256,
	256,
	512
	],
	"down_block_types": [
	"CogVideoXDownBlock3D",
	"CogVideoXDownBlock3D",
	"CogVideoXDownBlock3D",
	"CogVideoXDownBlock3D"
	],
	"force_upcast": true,
	"in_channels": 3,
	"latent_channels": 16,
	"latents_mean": null,
	"latents_std": null,
	"layers_per_block": 3,
	"norm_eps": 1e-06,
	"norm_num_groups": 32,
	"out_channels": 3,
	"sample_height": 480,
	"sample_width": 720,
	"scaling_factor": 0.7,
	"shift_factor": null,
	"temporal_compression_ratio": 4,
	"up_block_types": [
	"CogVideoXUpBlock3D",
	"CogVideoXUpBlock3D",
	"CogVideoXUpBlock3D",
	"CogVideoXUpBlock3D"
	],
	"use_post_quant_conv": false,
	"use_quant_conv": false
	}
	}