Spaces:
Runtime error
Runtime error
List
Browse files
app.py
CHANGED
|
@@ -375,10 +375,10 @@ def worker(input_image, image_position, prompts, n_prompt, seed, resolution, tot
|
|
| 375 |
llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
|
| 376 |
llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
|
| 377 |
|
| 378 |
-
llama_vec = llama_vec.to(transformer.dtype)
|
| 379 |
-
llama_vec_n = llama_vec_n.to(transformer.dtype)
|
| 380 |
-
clip_l_pooler = clip_l_pooler.to(transformer.dtype)
|
| 381 |
-
clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
|
| 382 |
return [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n]
|
| 383 |
|
| 384 |
total_latent_sections = (total_second_length * fps_number) / (latent_window_size * 4)
|
|
@@ -396,7 +396,7 @@ def worker(input_image, image_position, prompts, n_prompt, seed, resolution, tot
|
|
| 396 |
# Clean GPU
|
| 397 |
if not high_vram:
|
| 398 |
unload_complete_models(
|
| 399 |
-
text_encoder, text_encoder_2, image_encoder, vae, transformer
|
| 400 |
)
|
| 401 |
|
| 402 |
# Text encoding
|
|
@@ -461,7 +461,7 @@ def worker(input_image, image_position, prompts, n_prompt, seed, resolution, tot
|
|
| 461 |
|
| 462 |
# Dtype
|
| 463 |
|
| 464 |
-
image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
|
| 465 |
|
| 466 |
# Sampling
|
| 467 |
|
|
@@ -529,7 +529,7 @@ def worker(input_image, image_position, prompts, n_prompt, seed, resolution, tot
|
|
| 529 |
history_pixels = soft_append_bcthw(vae_decode(real_history_latents, vae).cpu(), history_pixels, overlapped_frames)
|
| 530 |
|
| 531 |
if not high_vram:
|
| 532 |
-
unload_complete_models(text_encoder, text_encoder_2, image_encoder, vae, transformer)
|
| 533 |
|
| 534 |
if enable_preview or section_index == (0 if first_section_index == (total_latent_sections - 1) else (total_latent_sections - 1)):
|
| 535 |
output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
|
|
@@ -557,12 +557,12 @@ def worker(input_image, image_position, prompts, n_prompt, seed, resolution, tot
|
|
| 557 |
|
| 558 |
if not high_vram:
|
| 559 |
unload_complete_models()
|
| 560 |
-
move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
|
| 561 |
|
| 562 |
if use_teacache:
|
| 563 |
-
transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
|
| 564 |
else:
|
| 565 |
-
transformer.initialize_teacache(enable_teacache=False)
|
| 566 |
|
| 567 |
if forward:
|
| 568 |
clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -(16 + 2 + 1):, :, :].split([16, 2, 1], dim=2)
|
|
@@ -572,7 +572,7 @@ def worker(input_image, image_position, prompts, n_prompt, seed, resolution, tot
|
|
| 572 |
clean_latents = torch.cat([clean_latents_1x, start_latent], dim=2)
|
| 573 |
|
| 574 |
generated_latents = sample_hunyuan(
|
| 575 |
-
transformer=transformer,
|
| 576 |
sampler='unipc',
|
| 577 |
width=width,
|
| 578 |
height=height,
|
|
@@ -602,7 +602,7 @@ def worker(input_image, image_position, prompts, n_prompt, seed, resolution, tot
|
|
| 602 |
callback=callback,
|
| 603 |
)
|
| 604 |
|
| 605 |
-
[total_generated_latent_frames, history_latents, history_pixels] = post_process(forward, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, section_index, total_latent_sections, outputs_folder, mp4_crf, stream)
|
| 606 |
|
| 607 |
if not forward:
|
| 608 |
if section_index > 0:
|
|
@@ -626,7 +626,7 @@ def worker(input_image, image_position, prompts, n_prompt, seed, resolution, tot
|
|
| 626 |
|
| 627 |
if not high_vram:
|
| 628 |
unload_complete_models(
|
| 629 |
-
text_encoder, text_encoder_2, image_encoder, vae, transformer
|
| 630 |
)
|
| 631 |
|
| 632 |
stream.output_queue.push(('end', None))
|
|
@@ -681,10 +681,10 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
|
|
| 681 |
llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
|
| 682 |
llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
|
| 683 |
|
| 684 |
-
llama_vec = llama_vec.to(transformer.dtype)
|
| 685 |
-
llama_vec_n = llama_vec_n.to(transformer.dtype)
|
| 686 |
-
clip_l_pooler = clip_l_pooler.to(transformer.dtype)
|
| 687 |
-
clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
|
| 688 |
return [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n]
|
| 689 |
|
| 690 |
stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
|
|
@@ -704,7 +704,7 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
|
|
| 704 |
# Clean GPU
|
| 705 |
if not high_vram:
|
| 706 |
unload_complete_models(
|
| 707 |
-
text_encoder, text_encoder_2, image_encoder, vae, transformer
|
| 708 |
)
|
| 709 |
|
| 710 |
# Text encoding
|
|
@@ -740,7 +740,7 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
|
|
| 740 |
image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
|
| 741 |
|
| 742 |
# Dtype
|
| 743 |
-
image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
|
| 744 |
|
| 745 |
if enable_preview:
|
| 746 |
def callback(d):
|
|
@@ -852,17 +852,17 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
|
|
| 852 |
|
| 853 |
if not high_vram:
|
| 854 |
unload_complete_models()
|
| 855 |
-
move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
|
| 856 |
|
| 857 |
if use_teacache:
|
| 858 |
-
transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
|
| 859 |
else:
|
| 860 |
-
transformer.initialize_teacache(enable_teacache=False)
|
| 861 |
|
| 862 |
[max_frames, clean_latents, clean_latents_2x, clean_latents_4x, latent_indices, clean_latents, clean_latent_indices, clean_latent_2x_indices, clean_latent_4x_indices] = compute_latent(history_latents, latent_window_size, num_clean_frames, start_latent)
|
| 863 |
|
| 864 |
generated_latents = sample_hunyuan(
|
| 865 |
-
transformer=transformer,
|
| 866 |
sampler='unipc',
|
| 867 |
width=width,
|
| 868 |
height=height,
|
|
@@ -895,7 +895,7 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
|
|
| 895 |
history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)
|
| 896 |
|
| 897 |
if not high_vram:
|
| 898 |
-
offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
|
| 899 |
load_model_as_complete(vae, target_device=gpu)
|
| 900 |
|
| 901 |
if history_pixels is None:
|
|
@@ -909,7 +909,7 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
|
|
| 909 |
history_pixels = soft_append_bcthw(history_pixels, vae_decode(real_history_latents, vae).cpu(), overlapped_frames)
|
| 910 |
|
| 911 |
if not high_vram:
|
| 912 |
-
unload_complete_models(text_encoder, text_encoder_2, image_encoder, vae, transformer)
|
| 913 |
|
| 914 |
if enable_preview or section_index == total_latent_sections - 1:
|
| 915 |
output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
|
|
@@ -941,7 +941,7 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
|
|
| 941 |
|
| 942 |
if not high_vram:
|
| 943 |
unload_complete_models(
|
| 944 |
-
text_encoder, text_encoder_2, image_encoder, vae, transformer
|
| 945 |
)
|
| 946 |
|
| 947 |
stream.output_queue.push(('end', None))
|
|
@@ -1182,7 +1182,7 @@ def process_video(input_video, prompt, n_prompt, randomize_seed, seed, auto_allo
|
|
| 1182 |
high_vram = False
|
| 1183 |
vae.enable_slicing()
|
| 1184 |
vae.enable_tiling()
|
| 1185 |
-
DynamicSwapInstaller.install_model(transformer, device=gpu)
|
| 1186 |
DynamicSwapInstaller.install_model(text_encoder, device=gpu)
|
| 1187 |
|
| 1188 |
# 20250508 pftq: automatically set distilled cfg to 1 if cfg is used
|
|
|
|
| 375 |
llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
|
| 376 |
llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
|
| 377 |
|
| 378 |
+
llama_vec = llama_vec.to(transformer[0].dtype)
|
| 379 |
+
llama_vec_n = llama_vec_n.to(transformer[0].dtype)
|
| 380 |
+
clip_l_pooler = clip_l_pooler.to(transformer[0].dtype)
|
| 381 |
+
clip_l_pooler_n = clip_l_pooler_n.to(transformer[0].dtype)
|
| 382 |
return [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n]
|
| 383 |
|
| 384 |
total_latent_sections = (total_second_length * fps_number) / (latent_window_size * 4)
|
|
|
|
| 396 |
# Clean GPU
|
| 397 |
if not high_vram:
|
| 398 |
unload_complete_models(
|
| 399 |
+
text_encoder, text_encoder_2, image_encoder, vae, transformer[0]
|
| 400 |
)
|
| 401 |
|
| 402 |
# Text encoding
|
|
|
|
| 461 |
|
| 462 |
# Dtype
|
| 463 |
|
| 464 |
+
image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer[0].dtype)
|
| 465 |
|
| 466 |
# Sampling
|
| 467 |
|
|
|
|
| 529 |
history_pixels = soft_append_bcthw(vae_decode(real_history_latents, vae).cpu(), history_pixels, overlapped_frames)
|
| 530 |
|
| 531 |
if not high_vram:
|
| 532 |
+
unload_complete_models(text_encoder, text_encoder_2, image_encoder, vae, transformer[0])
|
| 533 |
|
| 534 |
if enable_preview or section_index == (0 if first_section_index == (total_latent_sections - 1) else (total_latent_sections - 1)):
|
| 535 |
output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
|
|
|
|
| 557 |
|
| 558 |
if not high_vram:
|
| 559 |
unload_complete_models()
|
| 560 |
+
move_model_to_device_with_memory_preservation(transformer[0], target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
|
| 561 |
|
| 562 |
if use_teacache:
|
| 563 |
+
transformer[0].initialize_teacache(enable_teacache=True, num_steps=steps)
|
| 564 |
else:
|
| 565 |
+
transformer[0].initialize_teacache(enable_teacache=False)
|
| 566 |
|
| 567 |
if forward:
|
| 568 |
clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -(16 + 2 + 1):, :, :].split([16, 2, 1], dim=2)
|
|
|
|
| 572 |
clean_latents = torch.cat([clean_latents_1x, start_latent], dim=2)
|
| 573 |
|
| 574 |
generated_latents = sample_hunyuan(
|
| 575 |
+
transformer=transformer[0],
|
| 576 |
sampler='unipc',
|
| 577 |
width=width,
|
| 578 |
height=height,
|
|
|
|
| 602 |
callback=callback,
|
| 603 |
)
|
| 604 |
|
| 605 |
+
[total_generated_latent_frames, history_latents, history_pixels] = post_process(forward, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer[0], gpu, vae, history_pixels, latent_window_size, enable_preview, section_index, total_latent_sections, outputs_folder, mp4_crf, stream)
|
| 606 |
|
| 607 |
if not forward:
|
| 608 |
if section_index > 0:
|
|
|
|
| 626 |
|
| 627 |
if not high_vram:
|
| 628 |
unload_complete_models(
|
| 629 |
+
text_encoder, text_encoder_2, image_encoder, vae, transformer[0]
|
| 630 |
)
|
| 631 |
|
| 632 |
stream.output_queue.push(('end', None))
|
|
|
|
| 681 |
llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
|
| 682 |
llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
|
| 683 |
|
| 684 |
+
llama_vec = llama_vec.to(transformer[0].dtype)
|
| 685 |
+
llama_vec_n = llama_vec_n.to(transformer[0].dtype)
|
| 686 |
+
clip_l_pooler = clip_l_pooler.to(transformer[0].dtype)
|
| 687 |
+
clip_l_pooler_n = clip_l_pooler_n.to(transformer[0].dtype)
|
| 688 |
return [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n]
|
| 689 |
|
| 690 |
stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
|
|
|
|
| 704 |
# Clean GPU
|
| 705 |
if not high_vram:
|
| 706 |
unload_complete_models(
|
| 707 |
+
text_encoder, text_encoder_2, image_encoder, vae, transformer[0]
|
| 708 |
)
|
| 709 |
|
| 710 |
# Text encoding
|
|
|
|
| 740 |
image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
|
| 741 |
|
| 742 |
# Dtype
|
| 743 |
+
image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer[0].dtype)
|
| 744 |
|
| 745 |
if enable_preview:
|
| 746 |
def callback(d):
|
|
|
|
| 852 |
|
| 853 |
if not high_vram:
|
| 854 |
unload_complete_models()
|
| 855 |
+
move_model_to_device_with_memory_preservation(transformer[0], target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
|
| 856 |
|
| 857 |
if use_teacache:
|
| 858 |
+
transformer[0].initialize_teacache(enable_teacache=True, num_steps=steps)
|
| 859 |
else:
|
| 860 |
+
transformer[0].initialize_teacache(enable_teacache=False)
|
| 861 |
|
| 862 |
[max_frames, clean_latents, clean_latents_2x, clean_latents_4x, latent_indices, clean_latents, clean_latent_indices, clean_latent_2x_indices, clean_latent_4x_indices] = compute_latent(history_latents, latent_window_size, num_clean_frames, start_latent)
|
| 863 |
|
| 864 |
generated_latents = sample_hunyuan(
|
| 865 |
+
transformer=transformer[0],
|
| 866 |
sampler='unipc',
|
| 867 |
width=width,
|
| 868 |
height=height,
|
|
|
|
| 895 |
history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)
|
| 896 |
|
| 897 |
if not high_vram:
|
| 898 |
+
offload_model_from_device_for_memory_preservation(transformer[0], target_device=gpu, preserved_memory_gb=8)
|
| 899 |
load_model_as_complete(vae, target_device=gpu)
|
| 900 |
|
| 901 |
if history_pixels is None:
|
|
|
|
| 909 |
history_pixels = soft_append_bcthw(history_pixels, vae_decode(real_history_latents, vae).cpu(), overlapped_frames)
|
| 910 |
|
| 911 |
if not high_vram:
|
| 912 |
+
unload_complete_models(text_encoder, text_encoder_2, image_encoder, vae, transformer[0])
|
| 913 |
|
| 914 |
if enable_preview or section_index == total_latent_sections - 1:
|
| 915 |
output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
|
|
|
|
| 941 |
|
| 942 |
if not high_vram:
|
| 943 |
unload_complete_models(
|
| 944 |
+
text_encoder, text_encoder_2, image_encoder, vae, transformer[0]
|
| 945 |
)
|
| 946 |
|
| 947 |
stream.output_queue.push(('end', None))
|
|
|
|
| 1182 |
high_vram = False
|
| 1183 |
vae.enable_slicing()
|
| 1184 |
vae.enable_tiling()
|
| 1185 |
+
DynamicSwapInstaller.install_model(transformer[0], device=gpu)
|
| 1186 |
DynamicSwapInstaller.install_model(text_encoder, device=gpu)
|
| 1187 |
|
| 1188 |
# 20250508 pftq: automatically set distilled cfg to 1 if cfg is used
|