CalamitousFelicitousness committed on
Commit
bc5046c
·
verified ·
1 Parent(s): ddbe3b1

Upload folder using huggingface_hub

Browse files
Files changed (42) hide show
  1. .gitattributes +2 -0
  2. README.md +123 -0
  3. audio_vae/config.json +25 -0
  4. audio_vae/diffusion_pytorch_model.safetensors +3 -0
  5. connectors/config.json +19 -0
  6. connectors/diffusion_pytorch_model.safetensors +3 -0
  7. ltx2.3-open.png +3 -0
  8. model_index.json +36 -0
  9. scheduler/scheduler_config.json +18 -0
  10. text_encoder/config.json +114 -0
  11. text_encoder/diffusion_pytorch_model-00001-of-00012.safetensors +3 -0
  12. text_encoder/diffusion_pytorch_model-00002-of-00012.safetensors +3 -0
  13. text_encoder/diffusion_pytorch_model-00003-of-00012.safetensors +3 -0
  14. text_encoder/diffusion_pytorch_model-00004-of-00012.safetensors +3 -0
  15. text_encoder/diffusion_pytorch_model-00005-of-00012.safetensors +3 -0
  16. text_encoder/diffusion_pytorch_model-00006-of-00012.safetensors +3 -0
  17. text_encoder/diffusion_pytorch_model-00007-of-00012.safetensors +3 -0
  18. text_encoder/diffusion_pytorch_model-00008-of-00012.safetensors +3 -0
  19. text_encoder/diffusion_pytorch_model-00009-of-00012.safetensors +3 -0
  20. text_encoder/diffusion_pytorch_model-00010-of-00012.safetensors +3 -0
  21. text_encoder/diffusion_pytorch_model-00011-of-00012.safetensors +3 -0
  22. text_encoder/diffusion_pytorch_model-00012-of-00012.safetensors +3 -0
  23. text_encoder/diffusion_pytorch_model.safetensors.index.json +0 -0
  24. text_encoder/generation_config.json +11 -0
  25. tokenizer/added_tokens.json +3 -0
  26. tokenizer/chat_template.jinja +47 -0
  27. tokenizer/preprocessor_config.json +29 -0
  28. tokenizer/processor_config.json +4 -0
  29. tokenizer/special_tokens_map.json +33 -0
  30. tokenizer/tokenizer.json +3 -0
  31. tokenizer/tokenizer.model +3 -0
  32. tokenizer/tokenizer_config.json +0 -0
  33. transformer/config.json +33 -0
  34. transformer/diffusion_pytorch_model-00001-of-00004.safetensors +3 -0
  35. transformer/diffusion_pytorch_model-00002-of-00004.safetensors +3 -0
  36. transformer/diffusion_pytorch_model-00003-of-00004.safetensors +3 -0
  37. transformer/diffusion_pytorch_model-00004-of-00004.safetensors +3 -0
  38. transformer/diffusion_pytorch_model.safetensors.index.json +0 -0
  39. vae/config.json +81 -0
  40. vae/diffusion_pytorch_model.safetensors +3 -0
  41. vocoder/config.json +6 -0
  42. vocoder/diffusion_pytorch_model.safetensors +3 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ ltx2.3-open.png filter=lfs diff=lfs merge=lfs -text
37
+ tokenizer/tokenizer.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - en
4
+ library_name: diffusers
5
+ license: other
6
+ license_name: ltx-2-community-license-agreement
7
+ license_link: https://github.com/Lightricks/LTX-2/blob/main/LICENSE
8
+ pipeline_tag: image-to-video
9
+ arxiv: 2601.03233
10
+ tags:
11
+ - image-to-video
12
+ - text-to-video
13
+ - ltx-2
14
+ - ltx-2-3
15
+ - ltx-video
16
+ - lightricks
17
+ - diffusers
18
+ base_model: Lightricks/LTX-2.3
19
+ ---
20
+
21
+ > **LTX-2.3 Distilled (22B) — Diffusers Format**
22
+ >
23
+ > Distilled version of the full model. 8 steps, CFG=1.
24
+ >
25
+ > Converted from [Lightricks/LTX-2.3](https://huggingface.co/Lightricks/LTX-2.3) raw safetensors to HuggingFace Diffusers format.
26
+
27
+ ---
28
+
29
+
30
+ # LTX-2.3 Model Card
31
+
32
+ This model card focuses on the LTX-2.3 model, which is a significant update to the [LTX-2 model](https://huggingface.co/Lightricks/LTX-2) with improved audio and visual quality as well as enhanced prompt adherence.
33
+ LTX-2 was presented in the paper [LTX-2: Efficient Joint Audio-Visual Foundation Model](https://huggingface.co/papers/2601.03233).
34
+
35
+ 💻💻 **If you want to dive right into the code, it is available [here](https://github.com/Lightricks/LTX-2).** 💾💾
36
+
37
+ LTX-2.3 is a DiT-based audio-video foundation model designed to generate synchronized video and audio within a single model. It brings together the core building blocks of modern video generation, with open weights and a focus on practical, local execution.
38
+
39
+ [![LTX-2 Open Source](ltx2.3-open.png)](https://youtu.be/o-7us-BR_gQ)
40
+
41
+ # Model Checkpoints
42
+
43
+ | Name | Notes |
44
+ |------------------------------------|--------------------------------------------------------------------------------------------------------------------|
45
+ | ltx-2.3-22b-dev | The full model, flexible and trainable in bf16 |
46
+ | ltx-2.3-22b-distilled | The distilled version of the full model, 8 steps, CFG=1 |
47
+ | ltx-2.3-22b-distilled-lora-384 | A LoRA version of the distilled model applicable to the full model |
48
+ | ltx-2.3-spatial-upscaler-x2-1.0 | An x2 spatial upscaler for the ltx-2.3 latents, used in multi stage (multiscale) pipelines for higher resolution |
49
+ | ltx-2.3-spatial-upscaler-x1.5-1.0 | An x1.5 spatial upscaler for the ltx-2.3 latents, used in multi stage (multiscale) pipelines for higher resolution |
50
+ | ltx-2.3-temporal-upscaler-x2-1.0 | An x2 temporal upscaler for the ltx-2.3 latents, used in multi stage (multiscale) pipelines for higher FPS |
51
+
52
+ ## Model Details
53
+ - **Developed by:** Lightricks
54
+ - **Model type:** Diffusion-based audio-video foundation model
55
+ - **Language(s):** English
56
+
57
+ # Online demo
58
+ LTX-2.3 is accessible right away via the [API Playground](https://console.ltx.video/playground/).
59
+
60
+ # Run locally
61
+
62
+ ## Direct use license
63
+ You can use the models - full, distilled, upscalers and any derivatives of the models - for purposes permitted under the [license](https://github.com/Lightricks/LTX-2/blob/main/LICENSE).
64
+
65
+ ## ComfyUI
66
+ We recommend you use the built-in LTXVideo nodes that can be found in the ComfyUI Manager.
67
+ For manual installation information, please refer to our [documentation site](https://docs.ltx.video/open-source-model/integration-tools/comfy-ui).
68
+
69
+ ## PyTorch codebase
70
+
71
+ The [LTX-2 codebase](https://github.com/Lightricks/LTX-2) is a monorepo with several packages, from model definition in 'ltx-core' to pipelines in 'ltx-pipelines' and training capabilities in 'ltx-trainer'.
72
+ The codebase was tested with Python >=3.12, CUDA version >12.7, and supports PyTorch ~= 2.7.
73
+
74
+ ### Installation
75
+
76
+ ```bash
77
+ git clone https://github.com/Lightricks/LTX-2.git
78
+ cd LTX-2
79
+
80
+ # From the repository root
81
+ uv sync
82
+ source .venv/bin/activate
83
+ ```
84
+
85
+ ### Inference
86
+
87
+ To use our model, please follow the instructions in our [ltx-pipelines](https://github.com/Lightricks/LTX-2/blob/main/packages/ltx-pipelines/README.md) package.
88
+
89
+ ## Diffusers 🧨
90
+
91
+ LTX-2.3 support in the [Diffusers Python library](https://huggingface.co/docs/diffusers/main/en/index) is coming soon!
92
+
93
+ ## General tips:
94
+ * Width & height settings must be divisible by 32. Frame count must be a multiple of 8 plus 1 (i.e. of the form 8k + 1, such as 9, 17 or 25).
95
+ * If the resolution is not divisible by 32, or the number of frames is not of the form 8k + 1, the input should be padded with -1 and then cropped to the desired resolution and number of frames.
96
+ * For tips on writing effective prompts, please visit our [Prompting guide](https://ltx.video/blog/how-to-prompt-for-ltx-2)
97
+
98
+ ### Limitations
99
+ - This model is not intended or able to provide factual information.
100
+ - As a statistical model this checkpoint might amplify existing societal biases.
101
+ - The model may fail to generate videos that match the prompts perfectly.
102
+ - Prompt following is heavily influenced by the prompting style.
103
+ - The model may generate content that is inappropriate or offensive.
104
+ - When generating audio without speech, the audio may be of lower quality.
105
+
106
+ # Train the model
107
+
108
+ The base (dev) model is fully trainable.
109
+
110
+ It's extremely easy to reproduce the LoRAs and IC-LoRAs we publish with the model by following the instructions on the [LTX-2 Trainer Readme](https://github.com/Lightricks/LTX-2/blob/main/packages/ltx-trainer/README.md).
111
+
112
+ Training for motion, style or likeness (sound+appearance) can take less than an hour in many settings.
113
+
114
+ ## Citation
115
+
116
+ ```bibtex
117
+ @article{hacohen2025ltx2,
118
+ title={LTX-2: Efficient Joint Audio-Visual Foundation Model},
119
+ author={HaCohen, Yoav and Brazowski, Benny and Chiprut, Nisan and Bitterman, Yaki and Kvochko, Andrew and Berkowitz, Avishai and Shalem, Daniel and Lifschitz, Daphna and Moshe, Dudu and Porat, Eitan and Richardson, Eitan and Guy Shiran and Itay Chachy and Jonathan Chetboun and Michael Finkelson and Michael Kupchick and Nir Zabari and Nitzan Guetta and Noa Kotler and Ofir Bibi and Ori Gordon and Poriya Panet and Roi Benita and Shahar Armon and Victor Kulikov and Yaron Inger and Yonatan Shiftan and Zeev Melumian and Zeev Farbman},
120
+ journal={arXiv preprint arXiv:2601.03233},
121
+ year={2025}
122
+ }
123
+ ```
audio_vae/config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "AutoencoderKLLTX2Audio",
3
+ "_diffusers_version": "0.37.0.dev0",
4
+ "attn_resolutions": null,
5
+ "base_channels": 128,
6
+ "causality_axis": "height",
7
+ "ch_mult": [
8
+ 1,
9
+ 2,
10
+ 4
11
+ ],
12
+ "double_z": true,
13
+ "dropout": 0.0,
14
+ "in_channels": 2,
15
+ "is_causal": true,
16
+ "latent_channels": 8,
17
+ "mel_bins": 64,
18
+ "mel_hop_length": 160,
19
+ "mid_block_add_attention": false,
20
+ "norm_type": "pixel",
21
+ "num_res_blocks": 2,
22
+ "output_channels": 2,
23
+ "resolution": 256,
24
+ "sample_rate": 16000
25
+ }
audio_vae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb0efa51a43fe510df0f3108d951a2bc31645b5936dbb6123742184cf451f992
3
+ size 106507940
connectors/config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "LTX2TextConnectors",
3
+ "_diffusers_version": "0.37.0.dev0",
4
+ "audio_connector_attention_head_dim": 64,
5
+ "audio_connector_num_attention_heads": 32,
6
+ "audio_connector_num_layers": 8,
7
+ "audio_connector_num_learnable_registers": 128,
8
+ "caption_channels": 3840,
9
+ "causal_temporal_positioning": false,
10
+ "connector_rope_base_seq_len": 4096,
11
+ "rope_double_precision": true,
12
+ "rope_theta": 10000.0,
13
+ "rope_type": "interleaved",
14
+ "text_proj_in_factor": 49,
15
+ "video_connector_attention_head_dim": 128,
16
+ "video_connector_num_attention_heads": 32,
17
+ "video_connector_num_layers": 8,
18
+ "video_connector_num_learnable_registers": 128
19
+ }
connectors/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c455a1b365b3ba2d15a9c6e068c1e209eebc3bfef87e1fe0a5a9c65d98833fc0
3
+ size 6344489088
ltx2.3-open.png ADDED

Git LFS Details

  • SHA256: ab52a78cb26a9f91051effd0f5161889669391e9588ad810a4e1bd816a6f621b
  • Pointer size: 132 Bytes
  • Size of remote file: 2.26 MB
model_index.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "LTX2Pipeline",
3
+ "_diffusers_version": "0.37.0.dev0",
4
+ "audio_vae": [
5
+ "diffusers",
6
+ "AutoencoderKLLTX2Audio"
7
+ ],
8
+ "connectors": [
9
+ "ltx2",
10
+ "LTX2TextConnectors"
11
+ ],
12
+ "scheduler": [
13
+ "diffusers",
14
+ "FlowMatchEulerDiscreteScheduler"
15
+ ],
16
+ "text_encoder": [
17
+ "transformers",
18
+ "Gemma3ForConditionalGeneration"
19
+ ],
20
+ "tokenizer": [
21
+ "transformers",
22
+ "GemmaTokenizerFast"
23
+ ],
24
+ "transformer": [
25
+ "diffusers",
26
+ "LTX2VideoTransformer3DModel"
27
+ ],
28
+ "vae": [
29
+ "diffusers",
30
+ "AutoencoderKLLTX2Video"
31
+ ],
32
+ "vocoder": [
33
+ "ltx2",
34
+ "LTX2Vocoder"
35
+ ]
36
+ }
scheduler/scheduler_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "FlowMatchEulerDiscreteScheduler",
3
+ "_diffusers_version": "0.37.0.dev0",
4
+ "base_image_seq_len": 1024,
5
+ "base_shift": 0.95,
6
+ "invert_sigmas": false,
7
+ "max_image_seq_len": 4096,
8
+ "max_shift": 2.05,
9
+ "num_train_timesteps": 1000,
10
+ "shift": 1.0,
11
+ "shift_terminal": 0.1,
12
+ "stochastic_sampling": false,
13
+ "time_shift_type": "exponential",
14
+ "use_beta_sigmas": false,
15
+ "use_dynamic_shifting": true,
16
+ "use_exponential_sigmas": false,
17
+ "use_karras_sigmas": false
18
+ }
text_encoder/config.json ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Gemma3ForConditionalGeneration"
4
+ ],
5
+ "boi_token_index": 255999,
6
+ "dtype": "float32",
7
+ "eoi_token_index": 256000,
8
+ "eos_token_id": [
9
+ 1,
10
+ 106
11
+ ],
12
+ "image_token_index": 262144,
13
+ "initializer_range": 0.02,
14
+ "mm_tokens_per_image": 256,
15
+ "model_type": "gemma3",
16
+ "text_config": {
17
+ "_sliding_window_pattern": 6,
18
+ "attention_bias": false,
19
+ "attention_dropout": 0.0,
20
+ "attn_logit_softcapping": null,
21
+ "cache_implementation": "hybrid",
22
+ "dtype": "float32",
23
+ "final_logit_softcapping": null,
24
+ "head_dim": 256,
25
+ "hidden_activation": "gelu_pytorch_tanh",
26
+ "hidden_size": 3840,
27
+ "initializer_range": 0.02,
28
+ "intermediate_size": 15360,
29
+ "layer_types": [
30
+ "sliding_attention",
31
+ "sliding_attention",
32
+ "sliding_attention",
33
+ "sliding_attention",
34
+ "sliding_attention",
35
+ "full_attention",
36
+ "sliding_attention",
37
+ "sliding_attention",
38
+ "sliding_attention",
39
+ "sliding_attention",
40
+ "sliding_attention",
41
+ "full_attention",
42
+ "sliding_attention",
43
+ "sliding_attention",
44
+ "sliding_attention",
45
+ "sliding_attention",
46
+ "sliding_attention",
47
+ "full_attention",
48
+ "sliding_attention",
49
+ "sliding_attention",
50
+ "sliding_attention",
51
+ "sliding_attention",
52
+ "sliding_attention",
53
+ "full_attention",
54
+ "sliding_attention",
55
+ "sliding_attention",
56
+ "sliding_attention",
57
+ "sliding_attention",
58
+ "sliding_attention",
59
+ "full_attention",
60
+ "sliding_attention",
61
+ "sliding_attention",
62
+ "sliding_attention",
63
+ "sliding_attention",
64
+ "sliding_attention",
65
+ "full_attention",
66
+ "sliding_attention",
67
+ "sliding_attention",
68
+ "sliding_attention",
69
+ "sliding_attention",
70
+ "sliding_attention",
71
+ "full_attention",
72
+ "sliding_attention",
73
+ "sliding_attention",
74
+ "sliding_attention",
75
+ "sliding_attention",
76
+ "sliding_attention",
77
+ "full_attention"
78
+ ],
79
+ "max_position_embeddings": 131072,
80
+ "model_type": "gemma3_text",
81
+ "num_attention_heads": 16,
82
+ "num_hidden_layers": 48,
83
+ "num_key_value_heads": 8,
84
+ "query_pre_attn_scalar": 256,
85
+ "rms_norm_eps": 1e-06,
86
+ "rope_local_base_freq": 10000,
87
+ "rope_scaling": {
88
+ "factor": 8.0,
89
+ "rope_type": "linear"
90
+ },
91
+ "rope_theta": 1000000,
92
+ "sliding_window": 1024,
93
+ "sliding_window_pattern": 6,
94
+ "use_bidirectional_attention": false,
95
+ "use_cache": true,
96
+ "vocab_size": 262208
97
+ },
98
+ "transformers_version": "4.57.3",
99
+ "vision_config": {
100
+ "attention_dropout": 0.0,
101
+ "dtype": "float32",
102
+ "hidden_act": "gelu_pytorch_tanh",
103
+ "hidden_size": 1152,
104
+ "image_size": 896,
105
+ "intermediate_size": 4304,
106
+ "layer_norm_eps": 1e-06,
107
+ "model_type": "siglip_vision_model",
108
+ "num_attention_heads": 16,
109
+ "num_channels": 3,
110
+ "num_hidden_layers": 27,
111
+ "patch_size": 14,
112
+ "vision_use_head": false
113
+ }
114
+ }
text_encoder/diffusion_pytorch_model-00001-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06ffef2cbc9908f6db15a735a12c412c106ff7f112b3d4da72bc98c00bc2c034
3
+ size 1685231024
text_encoder/diffusion_pytorch_model-00002-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:308270af3b7caa5d2cd0076dff5a2dd9f0020d6628fe2d2ee04fa597cb066fbb
3
+ size 4987027560
text_encoder/diffusion_pytorch_model-00003-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:523d1b6d3ba4b9ede7a5e6f7df7599bdb12eeab23099694293ab2bbbfa62cc6f
3
+ size 4844750680
text_encoder/diffusion_pytorch_model-00004-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf426cf00fe66fa5fd48d2acae77082f7f423c71c55d9c7a8da26232e852b7a0
3
+ size 4954910584
text_encoder/diffusion_pytorch_model-00005-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01c8cec1fc6d7024b8fcf4517b79ca0df34279e4d6767423a2229772c1a9d5e3
3
+ size 4907665448
text_encoder/diffusion_pytorch_model-00006-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1a5ec996bdd602cfebba1fa7f06f6942643032b353b13a0fd1a8c00382efb24
3
+ size 4954910640
text_encoder/diffusion_pytorch_model-00007-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5594441c5b83d7404a16ebf5ec51e0947b9639b62561e2442170c0b6e0069502
3
+ size 4907665448
text_encoder/diffusion_pytorch_model-00008-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b333d3bb47641e91e6fa2cff9580b25463a5d76a1b1a272b77d3d6c0fe78a556
3
+ size 4954910640
text_encoder/diffusion_pytorch_model-00009-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:34db39ec863ee8c357f4247455bca8eabba9f3ccb9f838daf795db04b1919250
3
+ size 4907665448
text_encoder/diffusion_pytorch_model-00010-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e72953188ecbdf2a526371b46f66bfd27c58d5ad622bf5c4147aeab7ddb83cb
3
+ size 4954910640
text_encoder/diffusion_pytorch_model-00011-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29993bd9711eba9336246990ffa2cb6ae584816cad0249b6a0bc3729d95bb869
3
+ size 4962817760
text_encoder/diffusion_pytorch_model-00012-of-00012.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19a8f0f23c87c36285a10632fabfb2c091f211244d124b9c63074debba6e6b21
3
+ size 589949224
text_encoder/diffusion_pytorch_model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
text_encoder/generation_config.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cache_implementation": "hybrid",
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 1,
6
+ 106
7
+ ],
8
+ "top_k": 64,
9
+ "top_p": 0.95,
10
+ "transformers_version": "4.57.3"
11
+ }
tokenizer/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "<image_soft_token>": 262144
3
+ }
tokenizer/chat_template.jinja ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {{ bos_token }}
2
+ {%- if messages[0]['role'] == 'system' -%}
3
+ {%- if messages[0]['content'] is string -%}
4
+ {%- set first_user_prefix = messages[0]['content'] + '
5
+
6
+ ' -%}
7
+ {%- else -%}
8
+ {%- set first_user_prefix = messages[0]['content'][0]['text'] + '
9
+
10
+ ' -%}
11
+ {%- endif -%}
12
+ {%- set loop_messages = messages[1:] -%}
13
+ {%- else -%}
14
+ {%- set first_user_prefix = "" -%}
15
+ {%- set loop_messages = messages -%}
16
+ {%- endif -%}
17
+ {%- for message in loop_messages -%}
18
+ {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
19
+ {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
20
+ {%- endif -%}
21
+ {%- if (message['role'] == 'assistant') -%}
22
+ {%- set role = "model" -%}
23
+ {%- else -%}
24
+ {%- set role = message['role'] -%}
25
+ {%- endif -%}
26
+ {{ '<start_of_turn>' + role + '
27
+ ' + (first_user_prefix if loop.first else "") }}
28
+ {%- if message['content'] is string -%}
29
+ {{ message['content'] | trim }}
30
+ {%- elif message['content'] is iterable -%}
31
+ {%- for item in message['content'] -%}
32
+ {%- if item['type'] == 'image' -%}
33
+ {{ '<start_of_image>' }}
34
+ {%- elif item['type'] == 'text' -%}
35
+ {{ item['text'] | trim }}
36
+ {%- endif -%}
37
+ {%- endfor -%}
38
+ {%- else -%}
39
+ {{ raise_exception("Invalid content type") }}
40
+ {%- endif -%}
41
+ {{ '<end_of_turn>
42
+ ' }}
43
+ {%- endfor -%}
44
+ {%- if add_generation_prompt -%}
45
+ {{'<start_of_turn>model
46
+ '}}
47
+ {%- endif -%}
tokenizer/preprocessor_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_convert_rgb": null,
3
+ "do_normalize": true,
4
+ "do_pan_and_scan": null,
5
+ "do_rescale": true,
6
+ "do_resize": true,
7
+ "image_mean": [
8
+ 0.5,
9
+ 0.5,
10
+ 0.5
11
+ ],
12
+ "image_processor_type": "Gemma3ImageProcessor",
13
+ "image_seq_length": 256,
14
+ "image_std": [
15
+ 0.5,
16
+ 0.5,
17
+ 0.5
18
+ ],
19
+ "pan_and_scan_max_num_crops": null,
20
+ "pan_and_scan_min_crop_size": null,
21
+ "pan_and_scan_min_ratio_to_activate": null,
22
+ "processor_class": "Gemma3Processor",
23
+ "resample": 2,
24
+ "rescale_factor": 0.00392156862745098,
25
+ "size": {
26
+ "height": 896,
27
+ "width": 896
28
+ }
29
+ }
tokenizer/processor_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "image_seq_length": 256,
3
+ "processor_class": "Gemma3Processor"
4
+ }
tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "boi_token": "<start_of_image>",
3
+ "bos_token": {
4
+ "content": "<bos>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ "eoi_token": "<end_of_image>",
11
+ "eos_token": {
12
+ "content": "<eos>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "image_token": "<image_soft_token>",
19
+ "pad_token": {
20
+ "content": "<pad>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false
25
+ },
26
+ "unk_token": {
27
+ "content": "<unk>",
28
+ "lstrip": false,
29
+ "normalized": false,
30
+ "rstrip": false,
31
+ "single_word": false
32
+ }
33
+ }
tokenizer/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4667f2089529e8e7657cfb6d1c19910ae71ff5f28aa7ab2ff2763330affad795
3
+ size 33384568
tokenizer/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1299c11d7cf632ef3b4e11937501358ada021bbdf7c47638d13c0ee982f2e79c
3
+ size 4689074
tokenizer/tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff
 
transformer/config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "LTX2VideoTransformer3DModel",
3
+ "_diffusers_version": "0.37.0.dev0",
4
+ "activation_fn": "gelu-approximate",
5
+ "attention_bias": true,
6
+ "attention_head_dim": 128,
7
+ "attention_out_bias": true,
8
+ "audio_attention_head_dim": 64,
9
+ "audio_cross_attention_dim": 2048,
10
+ "audio_dim": 2048,
11
+ "audio_freq_embed_dim": 256,
12
+ "audio_in_channels": 64,
13
+ "audio_num_attention_heads": 32,
14
+ "audio_patch_size": 1,
15
+ "audio_patch_size_t": 2,
16
+ "caption_channels": 4096,
17
+ "caption_projection_dim": 4096,
18
+ "cross_attention_dim": 4096,
19
+ "in_channels": 128,
20
+ "norm_elementwise_affine": false,
21
+ "norm_eps": 1e-06,
22
+ "num_attention_heads": 32,
23
+ "num_layers": 48,
24
+ "out_channels": 128,
25
+ "patch_size": 1,
26
+ "patch_size_t": 1,
27
+ "qk_norm": "rms_norm_across_heads",
28
+ "rope_type": "interleaved",
29
+ "video_gated_attn": true,
30
+ "video_cross_attn_adaln": true,
31
+ "audio_gated_attn": true,
32
+ "audio_cross_attn_adaln": true
33
+ }
transformer/diffusion_pytorch_model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f95ded60a2388503610a9912339690d7a3eded74e254a91ba91d1bcce41e2eea
3
+ size 9953158120
transformer/diffusion_pytorch_model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:480a37a1df70e66b96c86184b802d1b4717af3fb95bf68bc00ec9198e0dccb59
3
+ size 9922545696
transformer/diffusion_pytorch_model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c98ed286a96e7337e109272836ca2570e16c0c968d30229cf5688b7cc8ebcd1
3
+ size 9972470088
transformer/diffusion_pytorch_model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d2d4e08a46536420aeadbdd1f99c2301ae14a1d998baf05e1fee728fb54f8bd
3
+ size 8139475264
transformer/diffusion_pytorch_model.safetensors.index.json ADDED
The diff for this file is too large to render. See raw diff
 
vae/config.json ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "AutoencoderKLLTX2Video",
3
+ "_diffusers_version": "0.37.0.dev0",
4
+ "block_out_channels": [
5
+ 256,
6
+ 512,
7
+ 1024,
8
+ 2048
9
+ ],
10
+ "decoder_block_out_channels": [
11
+ 256,
12
+ 512,
13
+ 1024
14
+ ],
15
+ "decoder_causal": false,
16
+ "decoder_inject_noise": [
17
+ false,
18
+ false,
19
+ false,
20
+ false
21
+ ],
22
+ "decoder_layers_per_block": [
23
+ 5,
24
+ 5,
25
+ 5,
26
+ 5
27
+ ],
28
+ "decoder_spatial_padding_mode": "reflect",
29
+ "decoder_spatio_temporal_scaling": [
30
+ true,
31
+ true,
32
+ true
33
+ ],
34
+ "down_block_types": [
35
+ "LTX2VideoDownBlock3D",
36
+ "LTX2VideoDownBlock3D",
37
+ "LTX2VideoDownBlock3D",
38
+ "LTX2VideoDownBlock3D"
39
+ ],
40
+ "downsample_type": [
41
+ "spatial",
42
+ "temporal",
43
+ "spatiotemporal",
44
+ "spatiotemporal"
45
+ ],
46
+ "encoder_causal": true,
47
+ "encoder_spatial_padding_mode": "zeros",
48
+ "in_channels": 3,
49
+ "latent_channels": 128,
50
+ "layers_per_block": [
51
+ 4,
52
+ 6,
53
+ 6,
54
+ 2,
55
+ 2
56
+ ],
57
+ "out_channels": 3,
58
+ "patch_size": 4,
59
+ "patch_size_t": 1,
60
+ "resnet_norm_eps": 1e-06,
61
+ "scaling_factor": 1.0,
62
+ "spatial_compression_ratio": 32,
63
+ "spatio_temporal_scaling": [
64
+ true,
65
+ true,
66
+ true,
67
+ true
68
+ ],
69
+ "temporal_compression_ratio": 8,
70
+ "timestep_conditioning": false,
71
+ "upsample_factor": [
72
+ 2,
73
+ 2,
74
+ 2
75
+ ],
76
+ "upsample_residual": [
77
+ true,
78
+ true,
79
+ true
80
+ ]
81
+ }
vae/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16c86f9a4b7dc792574241ed44ac4c20b9d95eb0abf61e5a18193af0f073e7c5
3
+ size 1452233138
vocoder/config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "LTX2Vocoder",
3
+ "_diffusers_version": "0.37.0.dev0",
4
+ "hop_length": 512,
5
+ "sample_rate": 44100
6
+ }
vocoder/diffusion_pytorch_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b64c7e94f0744ec68d04df6616bf5a8369bc20c41addb82f8ad5086fea2386f2
3
+ size 258308856