OzzyGT HF Staff committed on
Commit
6cc7fd3
·
verified ·
1 Parent(s): 90e74ab

Upload 17 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
audio_vae/config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "AutoencoderKLLTX2Audio",
3
+ "_diffusers_version": "0.37.0.dev0",
4
+ "attn_resolutions": null,
5
+ "base_channels": 16,
6
+ "causality_axis": "height",
7
+ "ch_mult": [
8
+ 1,
9
+ 2
10
+ ],
11
+ "double_z": true,
12
+ "dropout": 0.0,
13
+ "in_channels": 2,
14
+ "is_causal": true,
15
+ "latent_channels": 4,
16
+ "mel_bins": 16,
17
+ "mel_hop_length": 160,
18
+ "mid_block_add_attention": false,
19
+ "norm_type": "pixel",
20
+ "num_res_blocks": 1,
21
+ "output_channels": 2,
22
+ "resolution": 256,
23
+ "sample_rate": 16000
24
+ }
audio_vae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c49400e9ca1e90614af15569e829d361b42eaa1f8fc676dc9db4581be6462088
3
+ size 321628
connectors/config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "LTX2TextConnectors",
3
+ "_diffusers_version": "0.37.0.dev0",
4
+ "audio_connector_attention_head_dim": 32,
5
+ "audio_connector_num_attention_heads": 2,
6
+ "audio_connector_num_layers": 1,
7
+ "audio_connector_num_learnable_registers": 8,
8
+ "caption_channels": 64,
9
+ "causal_temporal_positioning": false,
10
+ "connector_rope_base_seq_len": 4096,
11
+ "rope_double_precision": true,
12
+ "rope_theta": 10000.0,
13
+ "rope_type": "split",
14
+ "text_proj_in_factor": 3,
15
+ "video_connector_attention_head_dim": 32,
16
+ "video_connector_num_attention_heads": 2,
17
+ "video_connector_num_layers": 1,
18
+ "video_connector_num_learnable_registers": 8
19
+ }
connectors/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6722757545bef47fb0b98ac1e6ff350b91a28d78b5abd6d1e208ed3071593d38
3
+ size 229688
scheduler/scheduler_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "FlowMatchEulerDiscreteScheduler",
3
+ "_diffusers_version": "0.37.0.dev0",
4
+ "base_image_seq_len": 1024,
5
+ "base_shift": 0.95,
6
+ "invert_sigmas": false,
7
+ "max_image_seq_len": 4096,
8
+ "max_shift": 2.05,
9
+ "num_train_timesteps": 1000,
10
+ "shift": 1.0,
11
+ "shift_terminal": null,
12
+ "stochastic_sampling": false,
13
+ "time_shift_type": "exponential",
14
+ "use_beta_sigmas": false,
15
+ "use_dynamic_shifting": false,
16
+ "use_exponential_sigmas": false,
17
+ "use_karras_sigmas": false
18
+ }
text_encoder/config.json ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Gemma3ForConditionalGeneration"
4
+ ],
5
+ "boi_token_index": 250,
6
+ "dtype": "bfloat16",
7
+ "eoi_token_index": 251,
8
+ "image_token_index": 252,
9
+ "initializer_range": 0.02,
10
+ "mm_tokens_per_image": 1,
11
+ "model_type": "gemma3",
12
+ "text_config": {
13
+ "_sliding_window_pattern": 6,
14
+ "attention_bias": false,
15
+ "attention_dropout": 0.0,
16
+ "attn_logit_softcapping": null,
17
+ "bos_token_id": 2,
18
+ "eos_token_id": 1,
19
+ "final_logit_softcapping": null,
20
+ "head_dim": 32,
21
+ "hidden_activation": "gelu_pytorch_tanh",
22
+ "hidden_size": 64,
23
+ "initializer_range": 0.02,
24
+ "intermediate_size": 128,
25
+ "layer_types": [
26
+ "full_attention",
27
+ "full_attention"
28
+ ],
29
+ "max_position_embeddings": 2048,
30
+ "model_type": "gemma3_text",
31
+ "num_attention_heads": 2,
32
+ "num_hidden_layers": 2,
33
+ "num_key_value_heads": 1,
34
+ "pad_token_id": 0,
35
+ "query_pre_attn_scalar": 256,
36
+ "rms_norm_eps": 1e-06,
37
+ "rope_parameters": {
38
+ "full_attention": {
39
+ "rope_theta": 1000000.0,
40
+ "rope_type": "default"
41
+ },
42
+ "rope_theta": null,
43
+ "rope_type": "default",
44
+ "sliding_attention": {
45
+ "rope_theta": 10000.0,
46
+ "rope_type": "default"
47
+ }
48
+ },
49
+ "sliding_window": 512,
50
+ "tie_word_embeddings": true,
51
+ "use_bidirectional_attention": false,
52
+ "use_cache": false,
53
+ "vocab_size": 262208
54
+ },
55
+ "tie_word_embeddings": true,
56
+ "transformers_version": "5.2.0",
57
+ "vision_config": {
58
+ "attention_dropout": 0.0,
59
+ "hidden_act": "gelu_pytorch_tanh",
60
+ "hidden_size": 64,
61
+ "image_size": 16,
62
+ "intermediate_size": 128,
63
+ "layer_norm_eps": 1e-06,
64
+ "model_type": "siglip_vision_model",
65
+ "num_attention_heads": 2,
66
+ "num_channels": 3,
67
+ "num_hidden_layers": 1,
68
+ "patch_size": 16
69
+ }
70
+ }
text_encoder/generation_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 2,
4
+ "eos_token_id": 1,
5
+ "output_attentions": false,
6
+ "output_hidden_states": false,
7
+ "pad_token_id": 0,
8
+ "transformers_version": "5.2.0",
9
+ "use_cache": false
10
+ }
text_encoder/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0424d56c3c5c87a94610b6d470521136eccba1493d1606281abe646f2d3bd13
3
+ size 33960096
tokenizer/chat_template.jinja ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{ bos_token }}
2
+ {%- if messages[0]['role'] == 'system' -%}
3
+ {%- if messages[0]['content'] is string -%}
4
+ {%- set first_user_prefix = messages[0]['content'] + '
5
+
6
+ ' -%}
7
+ {%- else -%}
8
+ {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
9
+
10
+ ' -%}
11
+ {%- endif -%}
12
+ {%- set loop_messages = messages[1:] -%}
13
+ {%- else -%}
14
+ {%- set first_user_prefix = "" -%}
15
+ {%- set loop_messages = messages -%}
16
+ {%- endif -%}
17
+ {%- for message in loop_messages -%}
18
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
19
+ {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
20
+ {%- endif -%}
21
+ {%- if (message['role'] == 'assistant') -%}
22
+ {%- set role = "model" -%}
23
+ {%- else -%}
24
+ {%- set role = message['role'] -%}
25
+ {%- endif -%}
26
+ {{ '<start_of_turn>' + role + '
27
+ ' + (first_user_prefix if loop.first else "") }}
28
+ {%- if message['content'] is string -%}
29
+ {{ message['content'] | trim }}
30
+ {%- elif message['content'] is iterable -%}
31
+ {%- for item in message['content'] -%}
32
+ {%- if item['type'] == 'image' -%}
33
+ {{ '<start_of_image>' }}
34
+ {%- elif item['type'] == 'text' -%}
35
+ {{ item['text'] | trim }}
36
+ {%- endif -%}
37
+ {%- endfor -%}
38
+ {%- else -%}
39
+ {{ raise_exception("Invalid content type") }}
40
+ {%- endif -%}
41
+ {{ '<end_of_turn>
42
+ ' }}
43
+ {%- endfor -%}
44
+ {%- if add_generation_prompt -%}
45
+ {{'<start_of_turn>model
46
+ '}}
47
+ {%- endif -%}
tokenizer/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a74aefb1dc1340a25f29ab8370384b9ed24b2d921d7749ece7bbcfcfdf00d497
3
+ size 33384443
tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "boi_token": "<start_of_image>",
4
+ "bos_token": "<bos>",
5
+ "clean_up_tokenization_spaces": false,
6
+ "eoi_token": "<end_of_image>",
7
+ "eos_token": "<eos>",
8
+ "image_token": "<image_soft_token>",
9
+ "is_local": true,
10
+ "mask_token": "<mask>",
11
+ "model_max_length": 1000000000000000019884624838656,
12
+ "model_specific_special_tokens": {
13
+ "boi_token": "<start_of_image>",
14
+ "eoi_token": "<end_of_image>",
15
+ "image_token": "<image_soft_token>"
16
+ },
17
+ "pad_token": "<pad>",
18
+ "processor_class": "Gemma3Processor",
19
+ "sp_model_kwargs": null,
20
+ "spaces_between_special_tokens": false,
21
+ "tokenizer_class": "GemmaTokenizer",
22
+ "unk_token": "<unk>",
23
+ "use_default_system_prompt": false
24
+ }
transformer/config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "LTX2VideoTransformer3DModel",
3
+ "_diffusers_version": "0.37.0.dev0",
4
+ "activation_fn": "gelu-approximate",
5
+ "attention_bias": true,
6
+ "attention_head_dim": 32,
7
+ "attention_out_bias": true,
8
+ "audio_attention_head_dim": 32,
9
+ "audio_cross_attention_dim": 64,
10
+ "audio_hop_length": 160,
11
+ "audio_in_channels": 16,
12
+ "audio_num_attention_heads": 2,
13
+ "audio_out_channels": 16,
14
+ "audio_patch_size": 1,
15
+ "audio_patch_size_t": 1,
16
+ "audio_pos_embed_max_pos": 20,
17
+ "audio_sampling_rate": 16000,
18
+ "audio_scale_factor": 4,
19
+ "base_height": 2048,
20
+ "base_width": 2048,
21
+ "caption_channels": 64,
22
+ "causal_offset": 1,
23
+ "cross_attention_dim": 64,
24
+ "cross_attn_timestep_scale_multiplier": 1000,
25
+ "in_channels": 4,
26
+ "norm_elementwise_affine": false,
27
+ "norm_eps": 1e-06,
28
+ "num_attention_heads": 2,
29
+ "num_layers": 2,
30
+ "out_channels": 4,
31
+ "patch_size": 1,
32
+ "patch_size_t": 1,
33
+ "pos_embed_max_pos": 20,
34
+ "qk_norm": "rms_norm_across_heads",
35
+ "rope_double_precision": true,
36
+ "rope_theta": 10000.0,
37
+ "rope_type": "interleaved",
38
+ "timestep_scale_multiplier": 1000,
39
+ "vae_scale_factors": [
40
+ 8,
41
+ 32,
42
+ 32
43
+ ]
44
+ }
transformer/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9ac96571eb5a10bf4a6756fe181f450e2efae73ac023f312556445ff83e7cd31
3
+ size 1164152
vae/config.json ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "AutoencoderKLLTX2Video",
3
+ "_diffusers_version": "0.37.0.dev0",
4
+ "block_out_channels": [
5
+ 32,
6
+ 64
7
+ ],
8
+ "decoder_block_out_channels": [
9
+ 32
10
+ ],
11
+ "decoder_causal": false,
12
+ "decoder_inject_noise": [
13
+ false,
14
+ false
15
+ ],
16
+ "decoder_layers_per_block": [
17
+ 1,
18
+ 1
19
+ ],
20
+ "decoder_spatial_padding_mode": "reflect",
21
+ "decoder_spatio_temporal_scaling": [
22
+ true
23
+ ],
24
+ "down_block_types": [
25
+ "LTX2VideoDownBlock3D",
26
+ "LTX2VideoDownBlock3D"
27
+ ],
28
+ "downsample_type": [
29
+ "spatial",
30
+ "temporal"
31
+ ],
32
+ "encoder_causal": true,
33
+ "encoder_spatial_padding_mode": "zeros",
34
+ "in_channels": 3,
35
+ "latent_channels": 4,
36
+ "layers_per_block": [
37
+ 1,
38
+ 1,
39
+ 1
40
+ ],
41
+ "out_channels": 3,
42
+ "patch_size": 4,
43
+ "patch_size_t": 1,
44
+ "resnet_norm_eps": 1e-06,
45
+ "scaling_factor": 1.0,
46
+ "spatial_compression_ratio": 32,
47
+ "spatio_temporal_scaling": [
48
+ true,
49
+ true
50
+ ],
51
+ "temporal_compression_ratio": 8,
52
+ "timestep_conditioning": false,
53
+ "upsample_factor": [
54
+ 2
55
+ ],
56
+ "upsample_residual": [
57
+ true
58
+ ]
59
+ }
vae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e6671690f82856448442ec433c4037a5691d03c2985a06558b0d59c2b55f69c
3
+ size 1052218
vocoder/config.json ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "LTX2Vocoder",
3
+ "_diffusers_version": "0.37.0.dev0",
4
+ "hidden_channels": 32,
5
+ "in_channels": 32,
6
+ "leaky_relu_negative_slope": 0.1,
7
+ "out_channels": 2,
8
+ "output_sampling_rate": 24000,
9
+ "resnet_dilations": [
10
+ [
11
+ 1,
12
+ 3
13
+ ]
14
+ ],
15
+ "resnet_kernel_sizes": [
16
+ 3
17
+ ],
18
+ "upsample_factors": [
19
+ 2,
20
+ 2
21
+ ],
22
+ "upsample_kernel_sizes": [
23
+ 4,
24
+ 4
25
+ ]
26
+ }
vocoder/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c337dfbd31cccec2be3ebc22db65ee3058c5f344aa58793bc52e9ed1ae67523
3
+ size 29740