Fabrice-TIERCELIN committed on
Commit
1419257
·
verified ·
1 Parent(s): 0f91c4b

Upload 4 files

Browse files
ltx_video/utils/diffusers_config_mapping.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
def make_hashable_key(dict_key):
    """Convert a (possibly nested) config dict into a deterministic, hashable tuple.

    Nested dicts become sorted tuples of ``(key, converted_value)`` pairs, so key
    insertion order does not affect the result; lists become tuples. The returned
    tuple is used as a dictionary key in ``diffusers_and_ours_config_mapping``.

    Args:
        dict_key: configuration dictionary to freeze.

    Returns:
        A nested tuple that is hashable as long as all leaf values are hashable.
    """

    def convert_value(value):
        if isinstance(value, list):
            # Recurse into list elements too, so lists of lists / lists of dicts
            # (e.g. a "blocks" entry) also become hashable. For flat scalar lists
            # this is identical to tuple(value).
            return tuple(convert_value(v) for v in value)
        elif isinstance(value, dict):
            return tuple(sorted((k, convert_value(v)) for k, v in value.items()))
        else:
            return value

    return tuple(sorted((k, convert_value(v)) for k, v in dict_key.items()))
11
+
12
+
13
+ DIFFUSERS_SCHEDULER_CONFIG = {
14
+ "_class_name": "FlowMatchEulerDiscreteScheduler",
15
+ "_diffusers_version": "0.32.0.dev0",
16
+ "base_image_seq_len": 1024,
17
+ "base_shift": 0.95,
18
+ "invert_sigmas": False,
19
+ "max_image_seq_len": 4096,
20
+ "max_shift": 2.05,
21
+ "num_train_timesteps": 1000,
22
+ "shift": 1.0,
23
+ "shift_terminal": 0.1,
24
+ "use_beta_sigmas": False,
25
+ "use_dynamic_shifting": True,
26
+ "use_exponential_sigmas": False,
27
+ "use_karras_sigmas": False,
28
+ }
29
+ DIFFUSERS_TRANSFORMER_CONFIG = {
30
+ "_class_name": "LTXVideoTransformer3DModel",
31
+ "_diffusers_version": "0.32.0.dev0",
32
+ "activation_fn": "gelu-approximate",
33
+ "attention_bias": True,
34
+ "attention_head_dim": 64,
35
+ "attention_out_bias": True,
36
+ "caption_channels": 4096,
37
+ "cross_attention_dim": 2048,
38
+ "in_channels": 128,
39
+ "norm_elementwise_affine": False,
40
+ "norm_eps": 1e-06,
41
+ "num_attention_heads": 32,
42
+ "num_layers": 28,
43
+ "out_channels": 128,
44
+ "patch_size": 1,
45
+ "patch_size_t": 1,
46
+ "qk_norm": "rms_norm_across_heads",
47
+ }
48
+ DIFFUSERS_VAE_CONFIG = {
49
+ "_class_name": "AutoencoderKLLTXVideo",
50
+ "_diffusers_version": "0.32.0.dev0",
51
+ "block_out_channels": [128, 256, 512, 512],
52
+ "decoder_causal": False,
53
+ "encoder_causal": True,
54
+ "in_channels": 3,
55
+ "latent_channels": 128,
56
+ "layers_per_block": [4, 3, 3, 3, 4],
57
+ "out_channels": 3,
58
+ "patch_size": 4,
59
+ "patch_size_t": 1,
60
+ "resnet_norm_eps": 1e-06,
61
+ "scaling_factor": 1.0,
62
+ "spatio_temporal_scaling": [True, True, True, False],
63
+ }
64
+
65
# In-repo scheduler config that DIFFUSERS_SCHEDULER_CONFIG maps to (see
# diffusers_and_ours_config_mapping below).
OURS_SCHEDULER_CONFIG = {
    "_class_name": "RectifiedFlowScheduler",
    "_diffusers_version": "0.25.1",
    "num_train_timesteps": 1000,
    "shifting": "SD3",
    "base_resolution": None,
    "target_shift_terminal": 0.1,
}

# In-repo transformer config that DIFFUSERS_TRANSFORMER_CONFIG maps to.
OURS_TRANSFORMER_CONFIG = {
    "_class_name": "Transformer3DModel",
    "_diffusers_version": "0.25.1",
    "_name_or_path": "PixArt-alpha/PixArt-XL-2-256x256",
    "activation_fn": "gelu-approximate",
    "attention_bias": True,
    "attention_head_dim": 64,
    "attention_type": "default",
    "caption_channels": 4096,
    "cross_attention_dim": 2048,
    "double_self_attention": False,
    "dropout": 0.0,
    "in_channels": 128,
    "norm_elementwise_affine": False,
    "norm_eps": 1e-06,
    "norm_num_groups": 32,
    "num_attention_heads": 32,
    "num_embeds_ada_norm": 1000,
    "num_layers": 28,
    "num_vector_embeds": None,
    "only_cross_attention": False,
    "out_channels": 128,
    "project_to_2d_pos": True,
    "upcast_attention": False,
    "use_linear_projection": False,
    "qk_norm": "rms_norm",
    "standardization_norm": "rms_norm",
    "positional_embedding_type": "rope",
    "positional_embedding_theta": 10000.0,
    "positional_embedding_max_pos": [20, 2048, 2048],
    "timestep_scale_multiplier": 1000,
}
# In-repo VAE config that DIFFUSERS_VAE_CONFIG maps to.
OURS_VAE_CONFIG = {
    "_class_name": "CausalVideoAutoencoder",
    "dims": 3,
    "in_channels": 3,
    "out_channels": 3,
    "latent_channels": 128,
    "blocks": [
        ["res_x", 4],
        ["compress_all", 1],
        ["res_x_y", 1],
        ["res_x", 3],
        ["compress_all", 1],
        ["res_x_y", 1],
        ["res_x", 3],
        ["compress_all", 1],
        ["res_x", 3],
        ["res_x", 4],
    ],
    "scaling_factor": 1.0,
    "norm_layer": "pixel_norm",
    "patch_size": 4,
    "latent_log_var": "uniform",
    "use_quant_conv": False,
    "causal_decoder": False,
}
131
+
132
+
133
# Lookup table: hashable form of each known diffusers config -> the equivalent
# in-repo config dict.
diffusers_and_ours_config_mapping = {
    make_hashable_key(DIFFUSERS_SCHEDULER_CONFIG): OURS_SCHEDULER_CONFIG,
    make_hashable_key(DIFFUSERS_TRANSFORMER_CONFIG): OURS_TRANSFORMER_CONFIG,
    make_hashable_key(DIFFUSERS_VAE_CONFIG): OURS_VAE_CONFIG,
}
138
+
139
+
140
# Substring renames for transformer state-dict keys.
# NOTE(review): presumably maps diffusers key names -> this repo's names
# (left-hand sides match diffusers conventions) — confirm against the
# checkpoint-conversion code that consumes this dict.
TRANSFORMER_KEYS_RENAME_DICT = {
    "proj_in": "patchify_proj",
    "time_embed": "adaln_single",
    "norm_q": "q_norm",
    "norm_k": "k_norm",
}
146
+
147
+
148
# Substring renames for VAE state-dict keys (diffusers block layout -> this
# repo's flat block numbering).
# NOTE(review): entry order appears significant — more specific prefixes
# (e.g. "decoder.up_blocks.3.conv_in") are listed before their shorter
# prefixes ("decoder.up_blocks.3"); preserve this ordering if the consumer
# applies replacements sequentially — confirm against the conversion code.
VAE_KEYS_RENAME_DICT = {
    "decoder.up_blocks.3.conv_in": "decoder.up_blocks.7",
    "decoder.up_blocks.3.upsamplers.0": "decoder.up_blocks.8",
    "decoder.up_blocks.3": "decoder.up_blocks.9",
    "decoder.up_blocks.2.upsamplers.0": "decoder.up_blocks.5",
    "decoder.up_blocks.2.conv_in": "decoder.up_blocks.4",
    "decoder.up_blocks.2": "decoder.up_blocks.6",
    "decoder.up_blocks.1.upsamplers.0": "decoder.up_blocks.2",
    "decoder.up_blocks.1": "decoder.up_blocks.3",
    "decoder.up_blocks.0": "decoder.up_blocks.1",
    "decoder.mid_block": "decoder.up_blocks.0",
    "encoder.down_blocks.3": "encoder.down_blocks.8",
    "encoder.down_blocks.2.downsamplers.0": "encoder.down_blocks.7",
    "encoder.down_blocks.2": "encoder.down_blocks.6",
    "encoder.down_blocks.1.downsamplers.0": "encoder.down_blocks.4",
    "encoder.down_blocks.1.conv_out": "encoder.down_blocks.5",
    "encoder.down_blocks.1": "encoder.down_blocks.3",
    "encoder.down_blocks.0.conv_out": "encoder.down_blocks.2",
    "encoder.down_blocks.0.downsamplers.0": "encoder.down_blocks.1",
    "encoder.down_blocks.0": "encoder.down_blocks.0",
    "encoder.mid_block": "encoder.down_blocks.9",
    "conv_shortcut.conv": "conv_shortcut",
    "resnets": "res_blocks",
    "norm3": "norm3.norm",
    "latents_mean": "per_channel_statistics.mean-of-means",
    "latents_std": "per_channel_statistics.std-of-means",
}
ltx_video/utils/prompt_enhance_utils.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import Union, List, Optional
3
+
4
+ import torch
5
+ from PIL import Image
6
+
7
+ logger = logging.getLogger(__name__) # pylint: disable=invalid-name
8
+
9
# System prompt for text-to-video (T2V) prompt enhancement: instructs the
# enhancer LLM to rewrite the user prompt as one cinematic paragraph (<=150 words).
T2V_CINEMATIC_PROMPT = """You are an expert cinematic director with many award winning movies, When writing prompts based on the user input, focus on detailed, chronological descriptions of actions and scenes.
Include specific movements, appearances, camera angles, and environmental details - all in a single flowing paragraph.
Start directly with the action, and keep descriptions literal and precise.
Think like a cinematographer describing a shot list.
Do not change the user input intent, just enhance it.
Keep within 150 words.
For best results, build your prompts using this structure:
Start with main action in a single sentence
Add specific details about movements and gestures
Describe character/object appearances precisely
Include background and environment details
Specify camera angles and movements
Describe lighting and colors
Note any changes or sudden events
Do not exceed the 150 word limit!
Output the enhanced prompt only.
"""

# System prompt for image-to-video (I2V) enhancement: same idea, but the image
# caption takes priority over the user text when they conflict.
I2V_CINEMATIC_PROMPT = """You are an expert cinematic director with many award winning movies, When writing prompts based on the user input, focus on detailed, chronological descriptions of actions and scenes.
Include specific movements, appearances, camera angles, and environmental details - all in a single flowing paragraph.
Start directly with the action, and keep descriptions literal and precise.
Think like a cinematographer describing a shot list.
Keep within 150 words.
For best results, build your prompts using this structure:
Describe the image first and then add the user input. Image description should be in first priority! Align to the image caption if it contradicts the user text input.
Start with main action in a single sentence
Add specific details about movements and gestures
Describe character/object appearances precisely
Include background and environment details
Specify camera angles and movements
Describe lighting and colors
Note any changes or sudden events
Align to the image caption if it contradicts the user text input.
Do not exceed the 150 word limit!
Output the enhanced prompt only.
"""
45
+
46
+
47
def tensor_to_pil(tensor):
    """Convert a [-1, 1]-normalized [C, H, W] image tensor to a PIL image.

    Args:
        tensor: torch.Tensor with values in [-1, 1], channels-first layout.

    Returns:
        PIL.Image.Image built from the uint8 [H, W, C] array.

    Raises:
        ValueError: if any tensor value falls outside [-1, 1].
    """
    # Validate the range with an explicit exception; the original `assert`
    # would be silently stripped under `python -O`.
    if tensor.min() < -1 or tensor.max() > 1:
        raise ValueError("tensor values must be in the range [-1, 1]")

    # Convert from [-1, 1] to [0, 1]
    tensor = (tensor + 1) / 2

    # Rearrange from [C, H, W] to [H, W, C]
    tensor = tensor.permute(1, 2, 0)

    # Convert to numpy array and then to uint8 range [0, 255]
    numpy_image = (tensor.cpu().numpy() * 255).astype("uint8")

    # Convert to PIL Image
    return Image.fromarray(numpy_image)
62
+
63
+
64
+ def generate_cinematic_prompt(
65
+ image_caption_model,
66
+ image_caption_processor,
67
+ prompt_enhancer_model,
68
+ prompt_enhancer_tokenizer,
69
+ prompt: Union[str, List[str]],
70
+ conditioning_items: Optional[List] = None,
71
+ max_new_tokens: int = 256,
72
+ ) -> List[str]:
73
+ prompts = [prompt] if isinstance(prompt, str) else prompt
74
+
75
+ if conditioning_items is None:
76
+ prompts = _generate_t2v_prompt(
77
+ prompt_enhancer_model,
78
+ prompt_enhancer_tokenizer,
79
+ prompts,
80
+ max_new_tokens,
81
+ T2V_CINEMATIC_PROMPT,
82
+ )
83
+ else:
84
+ if len(conditioning_items) > 1 or conditioning_items[0].media_frame_number != 0:
85
+ logger.warning(
86
+ "prompt enhancement does only support unconditional or first frame of conditioning items, returning original prompts"
87
+ )
88
+ return prompts
89
+
90
+ first_frame_conditioning_item = conditioning_items[0]
91
+ first_frames = _get_first_frames_from_conditioning_item(
92
+ first_frame_conditioning_item
93
+ )
94
+
95
+ assert len(first_frames) == len(
96
+ prompts
97
+ ), "Number of conditioning frames must match number of prompts"
98
+
99
+ prompts = _generate_i2v_prompt(
100
+ image_caption_model,
101
+ image_caption_processor,
102
+ prompt_enhancer_model,
103
+ prompt_enhancer_tokenizer,
104
+ prompts,
105
+ first_frames,
106
+ max_new_tokens,
107
+ I2V_CINEMATIC_PROMPT,
108
+ )
109
+
110
+ return prompts
111
+
112
+
113
def _get_first_frames_from_conditioning_item(conditioning_item) -> List[Image.Image]:
    """Extract frame 0 of every batch entry in the conditioning media as PIL images."""
    # Indexing below implies a 5-D media tensor with the frame axis at dim 2
    # (batch, channels, frames, height, width) — TODO confirm against the caller.
    media = conditioning_item.media_item
    frames = []
    for batch_idx in range(media.shape[0]):
        frames.append(tensor_to_pil(media[batch_idx, :, 0, :, :]))
    return frames
119
+
120
+
121
def _generate_t2v_prompt(
    prompt_enhancer_model,
    prompt_enhancer_tokenizer,
    prompts: List[str],
    max_new_tokens: int,
    system_prompt: str,
) -> List[str]:
    """Enhance text-only prompts: build one (system, user) chat per prompt,
    tokenize the rendered chats as a batch, and decode the model's output."""
    chat_texts = []
    for user_prompt in prompts:
        conversation = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"user_prompt: {user_prompt}"},
        ]
        # Render the chat to plain text with the generation prompt appended.
        chat_texts.append(
            prompt_enhancer_tokenizer.apply_chat_template(
                conversation, tokenize=False, add_generation_prompt=True
            )
        )

    model_inputs = prompt_enhancer_tokenizer(chat_texts, return_tensors="pt").to(
        prompt_enhancer_model.device
    )

    return _generate_and_decode_prompts(
        prompt_enhancer_model, prompt_enhancer_tokenizer, model_inputs, max_new_tokens
    )
149
+
150
+
151
def _generate_i2v_prompt(
    image_caption_model,
    image_caption_processor,
    prompt_enhancer_model,
    prompt_enhancer_tokenizer,
    prompts: List[str],
    first_frames: List[Image.Image],
    max_new_tokens: int,
    system_prompt: str,
) -> List[str]:
    """Enhance image-conditioned prompts: caption each first frame, then build
    one (system, user) chat per prompt/caption pair and run the enhancer LLM."""
    captions = _generate_image_captions(
        image_caption_model, image_caption_processor, first_frames
    )

    chat_texts = []
    for user_prompt, caption in zip(prompts, captions):
        conversation = [
            {"role": "system", "content": system_prompt},
            {
                "role": "user",
                "content": f"user_prompt: {user_prompt}\nimage_caption: {caption}",
            },
        ]
        chat_texts.append(
            prompt_enhancer_tokenizer.apply_chat_template(
                conversation, tokenize=False, add_generation_prompt=True
            )
        )

    model_inputs = prompt_enhancer_tokenizer(chat_texts, return_tensors="pt").to(
        prompt_enhancer_model.device
    )

    return _generate_and_decode_prompts(
        prompt_enhancer_model, prompt_enhancer_tokenizer, model_inputs, max_new_tokens
    )
186
+
187
+
188
def _generate_image_captions(
    image_caption_model,
    image_caption_processor,
    images: List[Image.Image],
    system_prompt: str = "<DETAILED_CAPTION>",
) -> List[str]:
    """Caption a batch of images with the captioning model.

    Uses deterministic beam search (no sampling, 3 beams) and returns one
    decoded caption string per input image.
    """
    task_prompts = [system_prompt] * len(images)
    inputs = image_caption_processor(task_prompts, images, return_tensors="pt").to(
        image_caption_model.device
    )

    with torch.inference_mode():
        generated_ids = image_caption_model.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=1024,
            do_sample=False,
            num_beams=3,
        )

    return image_caption_processor.batch_decode(generated_ids, skip_special_tokens=True)
209
+
210
+
211
def _generate_and_decode_prompts(
    prompt_enhancer_model, prompt_enhancer_tokenizer, model_inputs, max_new_tokens: int
) -> List[str]:
    """Run generation on pre-tokenized inputs and decode only the new tokens."""
    with torch.inference_mode():
        outputs = prompt_enhancer_model.generate(
            **model_inputs, max_new_tokens=max_new_tokens
        )
        # generate() echoes the prompt tokens; strip each input prefix so only
        # the newly generated continuation is decoded.
        continuations = []
        for input_ids, output_ids in zip(model_inputs.input_ids, outputs):
            continuations.append(output_ids[len(input_ids):])
        decoded_prompts = prompt_enhancer_tokenizer.batch_decode(
            continuations, skip_special_tokens=True
        )

    return decoded_prompts
ltx_video/utils/skip_layer_strategy.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ from enum import Enum, auto
2
+
3
+
4
class SkipLayerStrategy(Enum):
    """Strategy options for skipping parts of the model's layer computation.

    Member semantics are defined by the consumers of this enum elsewhere in
    the project; the names suggest skipping attention entirely, skipping only
    attention values, skipping via the residual path, or skipping a whole
    transformer block — confirm against the call sites.
    Values are assigned by auto(); do not reorder members.
    """

    AttentionSkip = auto()
    AttentionValues = auto()
    Residual = auto()
    TransformerBlock = auto()
ltx_video/utils/torch_utils.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from torch import nn
3
+
4
+
5
def append_dims(x: torch.Tensor, target_dims: int) -> torch.Tensor:
    """Return *x* with trailing singleton dimensions added until it has
    ``target_dims`` dimensions.

    Returns *x* itself (not a copy) when it already has ``target_dims`` dims.

    Raises:
        ValueError: if ``target_dims`` is smaller than ``x.ndim``.
    """
    missing = target_dims - x.ndim
    if missing < 0:
        raise ValueError(
            f"input has {x.ndim} dims but target_dims is {target_dims}, which is less"
        )
    if missing == 0:
        return x
    # Each None in the index tuple appends one size-1 axis at the end.
    return x[(...,) + (None,) * missing]
15
+
16
+
17
class Identity(nn.Module):
    """Argument-insensitive stand-in module that passes its input through untouched."""

    # pylint: disable=unused-argument
    def __init__(self, *args, **kwargs) -> None:
        """Accept and discard any constructor arguments."""
        super().__init__()

    # pylint: disable=unused-argument
    def forward(self, x: torch.Tensor, *args, **kwargs) -> torch.Tensor:
        """Return *x* unchanged; extra positional/keyword arguments are ignored."""
        return x