Carlos s committed
Upload pipeline_ltx_video.py

pipeline_ltx_video.py CHANGED (+51 -3)
@@ -186,6 +186,13 @@ def retrieve_timesteps(
         ]
         scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
         num_inference_steps = len(timesteps)
+
+
+    print(f"[ADUC DEBUG LTX *causal_video_autoencoder.py*]=======")
+    print(f"skip_initial_inference_steps {skip_initial_inference_steps}")
+    print(f"skip_final_inference_steps {skip_final_inference_steps}")
+    print(f"timesteps {timesteps}")
+
 
     return timesteps, num_inference_steps
 
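Note: this hunk instruments `retrieve_timesteps` just before it returns, but the debug tag names *causal_video_autoencoder.py* while the probes live in `pipeline_ltx_video.py`, which makes the log output misleading to grep. Below is a minimal sketch of the trimming these prints are meant to expose, assuming `skip_initial_inference_steps` and `skip_final_inference_steps` slice steps off the head and tail of the schedule (the slice closed by the `]` context line above):

```python
# Hedged sketch: how the skip parameters are assumed to trim the schedule.
timesteps = [1000, 900, 800, 700, 600, 500]
skip_initial_inference_steps = 1  # drop the first step
skip_final_inference_steps = 2    # drop the last two steps

trimmed = timesteps[
    skip_initial_inference_steps : len(timesteps) - skip_final_inference_steps
]
print(f"timesteps {trimmed}")       # timesteps [900, 800, 700]
num_inference_steps = len(trimmed)  # 3, as returned by retrieve_timesteps
```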
@@ -875,12 +882,18 @@ class LTXVideoPipeline(DiffusionPipeline):
             tone_map_compression_ratio: compression ratio for tone mapping, defaults to 0.0.
                 If set to 0.0, no tone mapping is applied. If set to 1.0 - full compression is applied.
         Examples:
-
         Returns:
             [`~pipelines.ImagePipelineOutput`] or `tuple`:
                 If `return_dict` is `True`, [`~pipelines.ImagePipelineOutput`] is returned, otherwise a `tuple` is
                 returned where the first element is a list with the generated images
         """
+
+        print(f"[1ADUC DEBUG LTX *causal_video_autoencoder.py*]=======")
+        print(f"skip_initial_inference_steps {skip_initial_inference_steps}")
+        print(f"skip_final_inference_steps {skip_final_inference_steps}")
+        print(f"latents {latents.shape}")
+
+
         if "mask_feature" in kwargs:
             deprecation_message = "The use of `mask_feature` is deprecated. It is no longer used in any computation and that doesn't affect the end results. It will be removed in a future version."
             deprecate("mask_feature", "1.0.0", deprecation_message, standard_warn=False)
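Note: probe [1] runs immediately after the docstring, before the pipeline prepares anything. If `__call__` accepts an optional `latents` argument that defaults to `None` (the usual diffusers convention; an assumption here), `latents.shape` raises `AttributeError` before generation even starts. A defensive variant of the same print:

```python
# Hedged sketch: the same probe, but safe when `latents` is still None.
latents = None  # value of the (assumed) optional argument before preparation
shape = getattr(latents, "shape", None)
print(f"latents {shape}")  # latents None
```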
@@ -948,6 +961,11 @@ class LTXVideoPipeline(DiffusionPipeline):
             skip_final_inference_steps=skip_final_inference_steps,
             **retrieve_timesteps_kwargs,
         )
+
+        print(f"[2ADUC DEBUG LTX *causal_video_autoencoder.py*]=======")
+        print(f"skip_initial_inference_steps {skip_initial_inference_steps}")
+        print(f"skip_final_inference_steps {skip_final_inference_steps}")
+        print(f"latents {latents.shape}")
 
         if self.allowed_inference_steps is not None:
             for timestep in [round(x, 4) for x in timesteps.tolist()]:
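The context lines show each timestep being rounded to four decimals and checked against `self.allowed_inference_steps`. The loop body is outside this hunk, so the sketch below only illustrates the membership check that the rounding implies; the exact comparison and error message are assumptions:

```python
# Hedged sketch of the implied validation (loop body not shown in the diff).
allowed_inference_steps = [1.0, 0.9937, 0.9875, 0.9812]
timesteps = [1.0, 0.99372, 0.98751]

for timestep in [round(x, 4) for x in timesteps]:
    if timestep not in allowed_inference_steps:
        raise ValueError(f"timestep {timestep} is not in the allowed list")
```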
@@ -1016,7 +1034,12 @@ class LTXVideoPipeline(DiffusionPipeline):
             conditioning_items,
             max_new_tokens=text_encoder_max_tokens,
         )
-
+
+        print(f"[4ADUC DEBUG LTX *causal_video_autoencoder.py*]=======")
+        print(f"skip_initial_inference_steps {skip_initial_inference_steps}")
+        print(f"skip_final_inference_steps {skip_final_inference_steps}")
+        print(f"latents {latents.shape}")
+
         # 3. Encode input prompt
         if self.text_encoder is not None:
             self.text_encoder = self.text_encoder.to(self._execution_device)
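By this point the same four-line print block has appeared three times, and the tags jump from `[2ADUC` to `[4ADUC`, suggesting a probe was removed along the way. If these probes are meant to stay for a while, a small helper behind the standard `logging` module would collapse the repetition and make them switchable at runtime; `debug_probe` below is a hypothetical name, not part of the pipeline:

```python
import logging

logger = logging.getLogger("aduc.ltx")

def debug_probe(tag, skip_initial, skip_final, latents=None):
    # Hypothetical helper replacing the repeated print blocks; emits nothing
    # unless logging is configured at DEBUG level.
    logger.debug(
        "[%s] skip_initial_inference_steps=%s skip_final_inference_steps=%s latents=%s",
        tag, skip_initial, skip_final, getattr(latents, "shape", None),
    )

# Usage, mirroring probe [4]:
# debug_probe("4ADUC", skip_initial_inference_steps,
#             skip_final_inference_steps, latents)
```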
@@ -1081,6 +1104,13 @@ class LTXVideoPipeline(DiffusionPipeline):
             generator=generator,
             vae_per_channel_normalize=vae_per_channel_normalize,
         )
+
+
+        print(f"[5ADUC DEBUG LTX *causal_video_autoencoder.py*]=======")
+        print(f"skip_initial_inference_steps {skip_initial_inference_steps}")
+        print(f"skip_final_inference_steps {skip_final_inference_steps}")
+        print(f"latents {latents.shape}")
+
 
         # Update the latents with the conditioning items and patchify them into (b, n, c)
         latents, pixel_coords, conditioning_mask, num_cond_latents = (
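The comment in the context lines says the latents are patchified into `(b, n, c)`, which explains why the later probes report a different `latents.shape` than probe [5]. A minimal sketch of that reshape for a patch size of 1 (the patch size and the `(b, c, f, h, w)` layout are assumptions; the real call is not shown in this hunk):

```python
import torch

# (b, c, f, h, w) video latents -> (b, n, c) token sequence, patch size 1.
b, c, f, h, w = 1, 128, 8, 16, 16
latents = torch.randn(b, c, f, h, w)

tokens = latents.flatten(2).transpose(1, 2)  # (b, f*h*w, c)
print(tokens.shape)  # torch.Size([1, 2048, 128])
```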
@@ -1096,9 +1126,20 @@ class LTXVideoPipeline(DiffusionPipeline):
         )
         init_latents = latents.clone()  # Used for image_cond_noise_update
 
+
+
+        print(f"[6ADUC DEBUG LTX *causal_video_autoencoder.py*]=======")
+        print(f"skip_initial_inference_steps {skip_initial_inference_steps}")
+        print(f"skip_final_inference_steps {skip_final_inference_steps}")
+        print(f"latents {latents.shape}")
         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
 
+
+        print(f"[7ADUC DEBUG LTX *causal_video_autoencoder.py*]=======")
+        print(f"skip_initial_inference_steps {skip_initial_inference_steps}")
+        print(f"skip_final_inference_steps {skip_final_inference_steps}")
+        print(f"latents {latents.shape}")
         # 7. Denoising loop
         num_warmup_steps = max(
             len(timesteps) - num_inference_steps * self.scheduler.order, 0
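Probe [7] lands right before the warmup computation shown in the context lines. A worked example of that formula with illustrative numbers (`scheduler.order` is 1 for first-order schedulers; in diffusers pipelines this count typically gates progress-bar updates):

```python
# num_warmup_steps = max(len(timesteps) - num_inference_steps * order, 0)
timesteps = list(range(30))  # 30 scheduled timesteps (illustrative)
num_inference_steps = 28
order = 1

num_warmup_steps = max(len(timesteps) - num_inference_steps * order, 0)
print(num_warmup_steps)  # 2
```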
@@ -1289,7 +1330,14 @@ class LTXVideoPipeline(DiffusionPipeline):
             if callback_on_step_end is not None:
                 callback_on_step_end(self, i, t, {})
 
-
+
+
+        print(f"[8ADUC DEBUG LTX *causal_video_autoencoder.py*]=======")
+        print(f"skip_initial_inference_steps {skip_initial_inference_steps}")
+        print(f"skip_final_inference_steps {skip_final_inference_steps}")
+        print(f"latents {latents.shape}")
+
+        if offload_to_cpu:
             self.transformer = self.transformer.cpu()
             if self._execution_device == "cuda":
                 torch.cuda.empty_cache()
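The final probe fires after the denoising loop, just before the transformer is offloaded. The re-added `if offload_to_cpu:` guard plus `torch.cuda.empty_cache()` is the standard offload pattern; a self-contained sketch of it (the `offload` helper name is illustrative, not part of the pipeline):

```python
import torch

def offload(module: torch.nn.Module, execution_device: str) -> torch.nn.Module:
    # Move the weights to host memory, then release the CUDA caching
    # allocator's unused blocks so other processes can reclaim the VRAM.
    module = module.cpu()
    if execution_device == "cuda":
        torch.cuda.empty_cache()  # frees cached, not actively used, memory
    return module

# Usage, mirroring the hunk:
# self.transformer = offload(self.transformer, self._execution_device)
```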