Batch mode
Browse files
app.py
CHANGED
|
@@ -41,7 +41,7 @@ from PIL import Image
|
|
| 41 |
from diffusers import AutoencoderKLHunyuanVideo
|
| 42 |
from transformers import LlamaModel, CLIPTextModel, LlamaTokenizerFast, CLIPTokenizer
|
| 43 |
from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode, vae_decode_fake
|
| 44 |
-
from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, state_dict_weighted_merge, state_dict_offset_merge
|
| 45 |
from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
|
| 46 |
from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
|
| 47 |
if torch.cuda.device_count() > 0:
|
|
@@ -368,7 +368,7 @@ def image_encode(image_np, target_width, target_height, vae, image_encoder, feat
|
|
| 368 |
raise
|
| 369 |
|
| 370 |
@torch.no_grad()
|
| 371 |
-
def worker(input_image, end_image, image_position, end_stillness, prompts, n_prompt, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
|
| 372 |
def encode_prompt(prompt, n_prompt):
|
| 373 |
llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
|
| 374 |
|
|
@@ -393,8 +393,6 @@ def worker(input_image, end_image, image_position, end_stillness, prompts, n_pro
|
|
| 393 |
section_index = first_section_index
|
| 394 |
forward = (image_position == 0)
|
| 395 |
|
| 396 |
-
job_id = generate_timestamp()
|
| 397 |
-
|
| 398 |
stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
|
| 399 |
|
| 400 |
try:
|
|
@@ -470,172 +468,179 @@ def worker(input_image, end_image, image_position, end_stillness, prompts, n_pro
|
|
| 470 |
|
| 471 |
image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
|
| 472 |
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
rnd = torch.Generator("cpu").manual_seed(seed)
|
| 478 |
-
|
| 479 |
-
history_latents = torch.zeros(size=(1, 16, 16 + 2 + 1, height // 8, width // 8), dtype=torch.float32, device=cpu)
|
| 480 |
-
start_latent = start_latent.to(history_latents)
|
| 481 |
-
history_pixels = None
|
| 482 |
-
|
| 483 |
-
history_latents = torch.cat([history_latents, start_latent] if forward else [start_latent, history_latents], dim=2)
|
| 484 |
-
total_generated_latent_frames = 1
|
| 485 |
-
|
| 486 |
-
if enable_preview:
|
| 487 |
-
def callback(d):
|
| 488 |
-
preview = d['denoised']
|
| 489 |
-
preview = vae_decode_fake(preview)
|
| 490 |
-
|
| 491 |
-
preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
|
| 492 |
-
preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
|
| 493 |
-
|
| 494 |
-
if stream.input_queue.top() == 'end':
|
| 495 |
-
stream.output_queue.push(('end', None))
|
| 496 |
-
raise KeyboardInterrupt('User ends the task.')
|
| 497 |
-
|
| 498 |
-
current_step = d['i'] + 1
|
| 499 |
-
percentage = int(100.0 * current_step / steps)
|
| 500 |
-
hint = f'Sampling {current_step}/{steps}'
|
| 501 |
-
desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / fps_number) :.2f} seconds (FPS-30), Resolution: {height}px * {width}px. The video is being extended now ...'
|
| 502 |
-
stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
|
| 503 |
-
return
|
| 504 |
-
else:
|
| 505 |
-
def callback(d):
|
| 506 |
-
return
|
| 507 |
-
|
| 508 |
-
indices = torch.arange(0, 1 + 16 + 2 + 1 + latent_window_size).unsqueeze(0)
|
| 509 |
-
if forward:
|
| 510 |
-
clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
|
| 511 |
-
clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
|
| 512 |
-
else:
|
| 513 |
-
latent_indices, clean_latent_1x_indices, clean_latent_2x_indices, clean_latent_4x_indices, clean_latent_indices_start = indices.split([latent_window_size, 1, 2, 16, 1], dim=1)
|
| 514 |
-
clean_latent_indices = torch.cat([clean_latent_1x_indices, clean_latent_indices_start], dim=1)
|
| 515 |
-
|
| 516 |
-
def post_process(forward, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, section_index, total_latent_sections, outputs_folder, mp4_crf, stream):
|
| 517 |
-
total_generated_latent_frames += int(generated_latents.shape[2])
|
| 518 |
-
history_latents = torch.cat([history_latents, generated_latents.to(history_latents)] if forward else [generated_latents.to(history_latents), history_latents], dim=2)
|
| 519 |
-
|
| 520 |
-
if not high_vram:
|
| 521 |
-
offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
|
| 522 |
-
load_model_as_complete(vae, target_device=gpu)
|
| 523 |
-
|
| 524 |
-
if history_pixels is None:
|
| 525 |
-
real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :] if forward else history_latents[:, :, :total_generated_latent_frames, :, :]
|
| 526 |
-
history_pixels = vae_decode(real_history_latents, vae).cpu()
|
| 527 |
-
else:
|
| 528 |
-
section_latent_frames = latent_window_size * 2
|
| 529 |
-
overlapped_frames = latent_window_size * 4 - 3
|
| 530 |
-
|
| 531 |
-
if forward:
|
| 532 |
-
real_history_latents = history_latents[:, :, -min(section_latent_frames, total_generated_latent_frames):, :, :]
|
| 533 |
-
history_pixels = soft_append_bcthw(history_pixels, vae_decode(real_history_latents, vae).cpu(), overlapped_frames)
|
| 534 |
-
else:
|
| 535 |
-
real_history_latents = history_latents[:, :, :min(section_latent_frames, total_generated_latent_frames), :, :]
|
| 536 |
-
history_pixels = soft_append_bcthw(vae_decode(real_history_latents, vae).cpu(), history_pixels, overlapped_frames)
|
| 537 |
-
|
| 538 |
-
if not high_vram:
|
| 539 |
-
unload_complete_models(text_encoder, text_encoder_2, image_encoder, vae, transformer)
|
| 540 |
-
|
| 541 |
-
if enable_preview or section_index == (0 if first_section_index == (total_latent_sections - 1) else (total_latent_sections - 1)):
|
| 542 |
-
output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
|
| 543 |
-
|
| 544 |
-
save_bcthw_as_mp4(history_pixels, output_filename, fps=fps_number, crf=mp4_crf)
|
| 545 |
-
|
| 546 |
-
print(f'Decoded. Current latent shape pixel shape {history_pixels.shape}')
|
| 547 |
-
|
| 548 |
-
stream.output_queue.push(('file', output_filename))
|
| 549 |
-
return [total_generated_latent_frames, history_latents, history_pixels]
|
| 550 |
-
|
| 551 |
-
while section_index < total_latent_sections:
|
| 552 |
-
if stream.input_queue.top() == 'end':
|
| 553 |
-
stream.output_queue.push(('end', None))
|
| 554 |
-
return
|
| 555 |
-
|
| 556 |
-
print(f'section_index = {section_index}, total_latent_sections = {total_latent_sections}')
|
| 557 |
-
|
| 558 |
-
prompt_index = min(section_index, len(prompt_parameters) - 1)
|
| 559 |
|
| 560 |
-
|
| 561 |
|
| 562 |
-
|
| 563 |
-
|
| 564 |
-
|
| 565 |
-
|
| 566 |
-
|
| 567 |
-
|
| 568 |
-
|
| 569 |
-
|
| 570 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 571 |
else:
|
| 572 |
-
|
| 573 |
-
|
|
|
|
|
|
|
| 574 |
if forward:
|
| 575 |
-
|
| 576 |
-
|
| 577 |
else:
|
| 578 |
-
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
generated_latents
|
| 582 |
-
|
| 583 |
-
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
|
| 587 |
-
|
| 588 |
-
|
| 589 |
-
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
generator=rnd,
|
| 593 |
-
prompt_embeds=llama_vec,
|
| 594 |
-
prompt_embeds_mask=llama_attention_mask,
|
| 595 |
-
prompt_poolers=clip_l_pooler,
|
| 596 |
-
negative_prompt_embeds=llama_vec_n,
|
| 597 |
-
negative_prompt_embeds_mask=llama_attention_mask_n,
|
| 598 |
-
negative_prompt_poolers=clip_l_pooler_n,
|
| 599 |
-
device=gpu,
|
| 600 |
-
dtype=torch.bfloat16,
|
| 601 |
-
image_embeddings=image_encoder_last_hidden_state,
|
| 602 |
-
latent_indices=latent_indices,
|
| 603 |
-
clean_latents=clean_latents,
|
| 604 |
-
clean_latent_indices=clean_latent_indices,
|
| 605 |
-
clean_latents_2x=clean_latents_2x,
|
| 606 |
-
clean_latent_2x_indices=clean_latent_2x_indices,
|
| 607 |
-
clean_latents_4x=clean_latents_4x,
|
| 608 |
-
clean_latent_4x_indices=clean_latent_4x_indices,
|
| 609 |
-
callback=callback,
|
| 610 |
-
)
|
| 611 |
-
del clean_latents
|
| 612 |
-
del clean_latents_2x
|
| 613 |
-
del clean_latents_4x
|
| 614 |
-
del latent_indices
|
| 615 |
-
del clean_latent_indices
|
| 616 |
-
del clean_latent_2x_indices
|
| 617 |
-
del clean_latent_4x_indices
|
| 618 |
-
|
| 619 |
-
[total_generated_latent_frames, history_latents, history_pixels] = post_process(forward, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, section_index, total_latent_sections, outputs_folder, mp4_crf, stream)
|
| 620 |
-
|
| 621 |
-
if not forward:
|
| 622 |
-
if section_index > 0:
|
| 623 |
-
section_index -= 1
|
| 624 |
else:
|
| 625 |
-
|
| 626 |
-
|
| 627 |
-
|
| 628 |
-
real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :]
|
| 629 |
-
zero_latents = history_latents[:, :, total_generated_latent_frames:, :, :]
|
| 630 |
-
history_latents = torch.cat([zero_latents, real_history_latents], dim=2)
|
| 631 |
-
del real_history_latents
|
| 632 |
-
del zero_latents
|
| 633 |
-
|
| 634 |
-
forward = True
|
| 635 |
-
section_index = first_section_index
|
| 636 |
|
| 637 |
-
|
| 638 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 639 |
except:
|
| 640 |
traceback.print_exc()
|
| 641 |
|
|
@@ -648,7 +653,7 @@ def worker(input_image, end_image, image_position, end_stillness, prompts, n_pro
|
|
| 648 |
return
|
| 649 |
|
| 650 |
@torch.no_grad()
|
| 651 |
-
def worker_start_end(input_image, end_image, image_position, end_stillness, prompts, n_prompt, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
|
| 652 |
def encode_prompt(prompt, n_prompt):
|
| 653 |
llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
|
| 654 |
|
|
@@ -668,8 +673,7 @@ def worker_start_end(input_image, end_image, image_position, end_stillness, prom
|
|
| 668 |
|
| 669 |
total_latent_sections = (total_second_length * fps_number) / (latent_window_size * 4)
|
| 670 |
total_latent_sections = int(max(round(total_latent_sections), 1))
|
| 671 |
-
|
| 672 |
-
job_id = generate_timestamp()
|
| 673 |
|
| 674 |
stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
|
| 675 |
|
|
@@ -729,9 +733,11 @@ def worker_start_end(input_image, end_image, image_position, end_stillness, prom
|
|
| 729 |
load_model_as_complete(vae, target_device=gpu)
|
| 730 |
|
| 731 |
start_latent = vae_encode(input_image_pt, vae)
|
|
|
|
| 732 |
|
| 733 |
if has_end_image:
|
| 734 |
end_latent = vae_encode(end_image_pt, vae)
|
|
|
|
| 735 |
|
| 736 |
# CLIP Vision
|
| 737 |
stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
|
|
@@ -740,6 +746,7 @@ def worker_start_end(input_image, end_image, image_position, end_stillness, prom
|
|
| 740 |
load_model_as_complete(image_encoder, target_device=gpu)
|
| 741 |
|
| 742 |
image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
|
|
|
|
| 743 |
image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
|
| 744 |
|
| 745 |
if has_end_image:
|
|
@@ -763,163 +770,171 @@ def worker_start_end(input_image, end_image, image_position, end_stillness, prom
|
|
| 763 |
# Dtype
|
| 764 |
image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
|
| 765 |
|
| 766 |
-
|
| 767 |
-
|
| 768 |
-
|
| 769 |
-
rnd = torch.Generator("cpu").manual_seed(seed)
|
| 770 |
-
num_frames = latent_window_size * 4 - 3
|
| 771 |
|
| 772 |
-
|
| 773 |
-
start_latent = start_latent.to(history_latents)
|
| 774 |
-
if has_end_image:
|
| 775 |
-
end_latent = end_latent.to(history_latents)
|
| 776 |
|
| 777 |
-
|
| 778 |
-
|
| 779 |
-
|
| 780 |
-
if total_latent_sections > 4:
|
| 781 |
-
# In theory the latent_paddings should follow the else sequence, but it seems that duplicating some
|
| 782 |
-
# items looks better than expanding it when total_latent_sections > 4
|
| 783 |
-
# One can try to remove below trick and just
|
| 784 |
-
# use `latent_paddings = list(reversed(range(total_latent_sections)))` to compare
|
| 785 |
-
latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0]
|
| 786 |
-
else:
|
| 787 |
-
# Convert an iterator to a list
|
| 788 |
-
latent_paddings = list(range(total_latent_sections - 1, -1, -1))
|
| 789 |
-
|
| 790 |
-
if enable_preview:
|
| 791 |
-
def callback(d):
|
| 792 |
-
preview = d['denoised']
|
| 793 |
-
preview = vae_decode_fake(preview)
|
| 794 |
-
|
| 795 |
-
preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
|
| 796 |
-
preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
|
| 797 |
-
|
| 798 |
-
if stream.input_queue.top() == 'end':
|
| 799 |
-
stream.output_queue.push(('end', None))
|
| 800 |
-
raise KeyboardInterrupt('User ends the task.')
|
| 801 |
-
|
| 802 |
-
current_step = d['i'] + 1
|
| 803 |
-
percentage = int(100.0 * current_step / steps)
|
| 804 |
-
hint = f'Sampling {current_step}/{steps}'
|
| 805 |
-
desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / fps_number) :.2f} seconds (FPS-30), Resolution: {height}px * {width}px. The video is being extended now ...'
|
| 806 |
-
stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
|
| 807 |
-
return
|
| 808 |
-
else:
|
| 809 |
-
def callback(d):
|
| 810 |
-
return
|
| 811 |
-
|
| 812 |
-
def post_process(job_id, start_latent, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, outputs_folder, mp4_crf, stream, is_last_section):
|
| 813 |
-
if is_last_section:
|
| 814 |
-
generated_latents = torch.cat([start_latent.to(generated_latents), generated_latents], dim=2)
|
| 815 |
|
| 816 |
-
|
| 817 |
-
|
| 818 |
|
| 819 |
-
|
| 820 |
-
|
| 821 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 822 |
|
| 823 |
-
if
|
| 824 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 825 |
else:
|
| 826 |
-
|
| 827 |
-
|
| 828 |
|
| 829 |
-
|
| 830 |
-
|
| 831 |
-
|
| 832 |
-
|
| 833 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 834 |
|
| 835 |
-
|
| 836 |
-
|
|
|
|
| 837 |
|
| 838 |
-
|
|
|
|
| 839 |
|
| 840 |
-
|
|
|
|
|
|
|
| 841 |
|
| 842 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 843 |
|
| 844 |
-
|
| 845 |
-
|
| 846 |
-
|
| 847 |
-
|
| 848 |
-
|
| 849 |
-
|
| 850 |
-
|
| 851 |
-
|
| 852 |
-
|
| 853 |
-
|
| 854 |
-
|
| 855 |
-
|
| 856 |
-
|
| 857 |
-
|
| 858 |
-
|
| 859 |
-
|
| 860 |
-
indices = torch.arange(1 + latent_padding_size + latent_window_size + 1 + (end_stillness if is_first_section else 0) + 2 + 16).unsqueeze(0)
|
| 861 |
-
clean_latent_indices_pre, blank_indices, latent_indices, clean_latent_indices_post, clean_latent_2x_indices, clean_latent_4x_indices = indices.split([1, latent_padding_size, latent_window_size, 1 + (end_stillness if is_first_section else 0), 2, 16], dim=1)
|
| 862 |
-
clean_latent_indices = torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1)
|
| 863 |
|
| 864 |
-
|
| 865 |
-
|
| 866 |
-
|
| 867 |
-
|
| 868 |
-
|
| 869 |
-
|
| 870 |
-
|
| 871 |
-
|
| 872 |
-
|
| 873 |
-
|
| 874 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 875 |
|
| 876 |
-
|
| 877 |
-
transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
|
| 878 |
-
else:
|
| 879 |
-
transformer.initialize_teacache(enable_teacache=False)
|
| 880 |
-
|
| 881 |
-
generated_latents = sample_hunyuan(
|
| 882 |
-
transformer=transformer,
|
| 883 |
-
sampler='unipc',
|
| 884 |
-
width=width,
|
| 885 |
-
height=height,
|
| 886 |
-
frames=num_frames,
|
| 887 |
-
real_guidance_scale=cfg,
|
| 888 |
-
distilled_guidance_scale=gs,
|
| 889 |
-
guidance_rescale=rs,
|
| 890 |
-
# shift=3.0,
|
| 891 |
-
num_inference_steps=steps,
|
| 892 |
-
generator=rnd,
|
| 893 |
-
prompt_embeds=llama_vec,
|
| 894 |
-
prompt_embeds_mask=llama_attention_mask,
|
| 895 |
-
prompt_poolers=clip_l_pooler,
|
| 896 |
-
negative_prompt_embeds=llama_vec_n,
|
| 897 |
-
negative_prompt_embeds_mask=llama_attention_mask_n,
|
| 898 |
-
negative_prompt_poolers=clip_l_pooler_n,
|
| 899 |
-
device=gpu,
|
| 900 |
-
dtype=torch.bfloat16,
|
| 901 |
-
image_embeddings=image_encoder_last_hidden_state,
|
| 902 |
-
latent_indices=latent_indices,
|
| 903 |
-
clean_latents=clean_latents,
|
| 904 |
-
clean_latent_indices=clean_latent_indices,
|
| 905 |
-
clean_latents_2x=clean_latents_2x,
|
| 906 |
-
clean_latent_2x_indices=clean_latent_2x_indices,
|
| 907 |
-
clean_latents_4x=clean_latents_4x,
|
| 908 |
-
clean_latent_4x_indices=clean_latent_4x_indices,
|
| 909 |
-
callback=callback,
|
| 910 |
-
)
|
| 911 |
-
del clean_latents
|
| 912 |
-
del clean_latents_2x
|
| 913 |
-
del clean_latents_4x
|
| 914 |
-
del latent_indices
|
| 915 |
-
del clean_latent_indices
|
| 916 |
-
del clean_latent_2x_indices
|
| 917 |
-
del clean_latent_4x_indices
|
| 918 |
-
|
| 919 |
-
[total_generated_latent_frames, history_latents, history_pixels] = post_process(job_id, start_latent, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, outputs_folder, mp4_crf, stream, is_last_section)
|
| 920 |
-
|
| 921 |
-
if is_last_section:
|
| 922 |
-
break
|
| 923 |
except:
|
| 924 |
traceback.print_exc()
|
| 925 |
|
|
@@ -1116,7 +1131,6 @@ def worker_video(input_video, end_frame, end_stillness, prompts, n_prompt, seed,
|
|
| 1116 |
if batch > 1:
|
| 1117 |
print(f"Beginning video {idx+1} of {batch} with seed {seed} ")
|
| 1118 |
|
| 1119 |
-
#job_id = generate_timestamp()
|
| 1120 |
job_id = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")+f"_framepackf1-videoinput_{width}-{total_second_length}sec_seed-{seed}_steps-{steps}_distilled-{gs}_cfg-{cfg}" # 20250506 pftq: easier to read timestamp and filename
|
| 1121 |
|
| 1122 |
# Sampling
|
|
@@ -1132,7 +1146,7 @@ def worker_video(input_video, end_frame, end_stillness, prompts, n_prompt, seed,
|
|
| 1132 |
|
| 1133 |
# 20250509 Generate backwards with end frame for better end frame anchoring
|
| 1134 |
if total_latent_sections > 4:
|
| 1135 |
-
latent_paddings = [3] + [
|
| 1136 |
else:
|
| 1137 |
latent_paddings = list(reversed(range(total_latent_sections)))
|
| 1138 |
|
|
@@ -1253,30 +1267,33 @@ def worker_video(input_video, end_frame, end_stillness, prompts, n_prompt, seed,
|
|
| 1253 |
stream.output_queue.push(('end', None))
|
| 1254 |
return
|
| 1255 |
|
| 1256 |
-
def get_duration(input_image, end_image, image_position, end_stillness, prompts, generation_mode, n_prompt, seed, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
|
| 1257 |
return allocation_time
|
| 1258 |
|
| 1259 |
@spaces.GPU(duration=get_duration)
|
| 1260 |
-
def process_on_gpu(input_image, end_image, image_position, end_stillness, prompts, generation_mode, n_prompt, seed, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number
|
| 1261 |
):
|
| 1262 |
start = time.time()
|
| 1263 |
global stream
|
| 1264 |
stream = AsyncStream()
|
| 1265 |
|
| 1266 |
-
async_run(worker_start_end if generation_mode == "start_end" else worker, input_image, end_image, image_position, end_stillness, prompts, n_prompt, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number)
|
| 1267 |
|
| 1268 |
output_filename = None
|
|
|
|
| 1269 |
|
| 1270 |
while True:
|
| 1271 |
flag, data = stream.output_queue.next()
|
| 1272 |
|
| 1273 |
if flag == 'file':
|
| 1274 |
output_filename = data
|
| 1275 |
-
|
|
|
|
|
|
|
| 1276 |
|
| 1277 |
if flag == 'progress':
|
| 1278 |
preview, desc, html = data
|
| 1279 |
-
yield gr.update(label="Previewed Frames"), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True), gr.skip()
|
| 1280 |
|
| 1281 |
if flag == 'end':
|
| 1282 |
end = time.time()
|
|
@@ -1285,7 +1302,7 @@ def process_on_gpu(input_image, end_image, image_position, end_stillness, prompt
|
|
| 1285 |
secondes = secondes - (minutes * 60)
|
| 1286 |
hours = math.floor(minutes / 60)
|
| 1287 |
minutes = minutes - (hours * 60)
|
| 1288 |
-
yield gr.update(value=output_filename, label="Finished Frames"), gr.update(visible=False), gr.skip(), "The process has lasted " + \
|
| 1289 |
((str(hours) + " h, ") if hours != 0 else "") + \
|
| 1290 |
((str(minutes) + " min, ") if hours != 0 or minutes != 0 else "") + \
|
| 1291 |
str(secondes) + " sec. " + \
|
|
@@ -1303,6 +1320,7 @@ def process(input_image,
|
|
| 1303 |
seed=31337,
|
| 1304 |
auto_allocation=True,
|
| 1305 |
allocation_time=180,
|
|
|
|
| 1306 |
resolution=640,
|
| 1307 |
total_second_length=5,
|
| 1308 |
latent_window_size=9,
|
|
@@ -1321,7 +1339,7 @@ def process(input_image,
|
|
| 1321 |
|
| 1322 |
if torch.cuda.device_count() == 0:
|
| 1323 |
gr.Warning('Set this space to GPU config to make it work.')
|
| 1324 |
-
yield gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.update(visible = False)
|
| 1325 |
return
|
| 1326 |
|
| 1327 |
if randomize_seed:
|
|
@@ -1336,7 +1354,7 @@ def process(input_image,
|
|
| 1336 |
assert input_image is not None, 'No input image!'
|
| 1337 |
assert (generation_mode != "start_end") or end_image is not None, 'No end image!'
|
| 1338 |
|
| 1339 |
-
yield gr.update(label="Previewed Frames"), None, '', '', gr.update(interactive=False), gr.update(interactive=True), gr.skip()
|
| 1340 |
|
| 1341 |
gc.collect()
|
| 1342 |
yield from process_on_gpu(input_image,
|
|
@@ -1347,6 +1365,7 @@ def process(input_image,
|
|
| 1347 |
generation_mode,
|
| 1348 |
n_prompt,
|
| 1349 |
seed,
|
|
|
|
| 1350 |
resolution,
|
| 1351 |
total_second_length,
|
| 1352 |
allocation_time,
|
|
@@ -1375,17 +1394,20 @@ def process_video_on_gpu(input_video, end_frame, end_stillness, prompts, n_promp
|
|
| 1375 |
async_run(worker_video, input_video, end_frame, end_stillness, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch)
|
| 1376 |
|
| 1377 |
output_filename = None
|
|
|
|
| 1378 |
|
| 1379 |
while True:
|
| 1380 |
flag, data = stream.output_queue.next()
|
| 1381 |
|
| 1382 |
if flag == 'file':
|
| 1383 |
output_filename = data
|
| 1384 |
-
|
|
|
|
|
|
|
| 1385 |
|
| 1386 |
if flag == 'progress':
|
| 1387 |
preview, desc, html = data
|
| 1388 |
-
yield gr.update(label="Previewed Frames"), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True), gr.skip() # 20250506 pftq: Keep refreshing the video in case it got hidden when the tab was in the background
|
| 1389 |
|
| 1390 |
if flag == 'end':
|
| 1391 |
end = time.time()
|
|
@@ -1394,7 +1416,7 @@ def process_video_on_gpu(input_video, end_frame, end_stillness, prompts, n_promp
|
|
| 1394 |
secondes = secondes - (minutes * 60)
|
| 1395 |
hours = math.floor(minutes / 60)
|
| 1396 |
minutes = minutes - (hours * 60)
|
| 1397 |
-
yield gr.update(value=output_filename, label="Finished Frames"), gr.update(visible=False), desc + \
|
| 1398 |
" The process has lasted " + \
|
| 1399 |
((str(hours) + " h, ") if hours != 0 else "") + \
|
| 1400 |
((str(minutes) + " min, ") if hours != 0 or minutes != 0 else "") + \
|
|
@@ -1409,7 +1431,7 @@ def process_video(input_video, end_frame, end_stillness, prompt, n_prompt, rando
|
|
| 1409 |
|
| 1410 |
if torch.cuda.device_count() == 0:
|
| 1411 |
gr.Warning('Set this space to GPU config to make it work.')
|
| 1412 |
-
yield gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.update(visible = False)
|
| 1413 |
return
|
| 1414 |
|
| 1415 |
if randomize_seed:
|
|
@@ -1420,7 +1442,7 @@ def process_video(input_video, end_frame, end_stillness, prompt, n_prompt, rando
|
|
| 1420 |
# 20250506 pftq: Updated assertion for video input
|
| 1421 |
assert input_video is not None, 'No input video!'
|
| 1422 |
|
| 1423 |
-
yield gr.update(label="Previewed Frames"), None, '', '', gr.update(interactive=False), gr.update(interactive=True), gr.skip()
|
| 1424 |
|
| 1425 |
# 20250507 pftq: Even the H100 needs offloading if the video dimensions are 720p or higher
|
| 1426 |
if high_vram and (no_resize or resolution>640):
|
|
@@ -1535,7 +1557,7 @@ with block:
|
|
| 1535 |
enable_preview = gr.Checkbox(label='Enable preview', value=True, info='Display a preview around each second generated but it costs 2 sec. for each second generated.')
|
| 1536 |
use_teacache = gr.Checkbox(label='Use TeaCache', value=False, info='Faster speed and no break in brightness, but often makes hands and fingers slightly worse.')
|
| 1537 |
|
| 1538 |
-
n_prompt = gr.Textbox(label="Negative Prompt", value="Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", info='Requires using normal CFG (undistilled) instead of Distilled (set Distilled=1 and CFG > 1).')
|
| 1539 |
|
| 1540 |
fps_number = gr.Slider(label="Frame per seconds", info="The model is trained for 30 fps so other fps may generate weird results", minimum=10, maximum=60, value=30, step=1)
|
| 1541 |
end_stillness = gr.Slider(label="End stillness", minimum=0, maximum=100, value=0, step=1, info='0=Realistic end; >0=Matches exactly the end image (but the time seems to freeze)')
|
|
@@ -1548,7 +1570,7 @@ with block:
|
|
| 1548 |
resolution = gr.Dropdown([
|
| 1549 |
["409,600 px (working)", 640],
|
| 1550 |
["451,584 px (working)", 672],
|
| 1551 |
-
["495,616 px (
|
| 1552 |
["589,824 px (not tested)", 768],
|
| 1553 |
["692,224 px (not tested)", 832],
|
| 1554 |
["746,496 px (not tested)", 864],
|
|
@@ -1576,7 +1598,7 @@ with block:
|
|
| 1576 |
gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. Larger value causes slower speed.")
|
| 1577 |
|
| 1578 |
mp4_crf = gr.Slider(label="MP4 Compression", minimum=0, maximum=100, value=16, step=1, info="Lower means better quality. 0 is uncompressed. Change to 16 if you get black outputs. ")
|
| 1579 |
-
batch = gr.Slider(label="Batch Size (
|
| 1580 |
with gr.Row():
|
| 1581 |
randomize_seed = gr.Checkbox(label='Randomize seed', value=True, info='If checked, the seed is always different')
|
| 1582 |
seed = gr.Slider(label="Seed", minimum=0, maximum=np.iinfo(np.int32).max, step=1, randomize=True)
|
|
@@ -1586,12 +1608,21 @@ with block:
|
|
| 1586 |
|
| 1587 |
with gr.Column():
|
| 1588 |
warning = gr.HTML(elem_id="warning", value = "<center><big>Your computer must <u>not</u> enter into standby mode.</big><br/>On Chrome, you can force to keep a tab alive in <code>chrome://discards/</code></center>", visible = False)
|
| 1589 |
-
result_video = gr.Video(label="Generated Frames", autoplay=True, show_share_button=False, height=512, loop=True)
|
| 1590 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1591 |
progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
|
| 1592 |
progress_bar = gr.HTML('', elem_classes='no-generating-animation')
|
| 1593 |
|
| 1594 |
-
ips = [input_image, end_image, image_position, end_stillness, final_prompt, generation_mode, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number]
|
| 1595 |
ips_video = [input_video, end_image, end_stillness, final_prompt, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch]
|
| 1596 |
|
| 1597 |
gr.Examples(
|
|
@@ -1604,11 +1635,12 @@ with block:
|
|
| 1604 |
1, # end_stillness
|
| 1605 |
"Overcrowed street in Japan, photorealistic, realistic, intricate details, 8k, insanely detailed",
|
| 1606 |
"text", # generation_mode
|
| 1607 |
-
"Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
|
| 1608 |
True, # randomize_seed
|
| 1609 |
42, # seed
|
| 1610 |
True, # auto_allocation
|
| 1611 |
180, # allocation_time
|
|
|
|
| 1612 |
672, # resolution
|
| 1613 |
1, # total_second_length
|
| 1614 |
9, # latent_window_size
|
|
@@ -1626,7 +1658,7 @@ with block:
|
|
| 1626 |
run_on_click = True,
|
| 1627 |
fn = process,
|
| 1628 |
inputs = ips,
|
| 1629 |
-
outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
|
| 1630 |
cache_examples = False,
|
| 1631 |
)
|
| 1632 |
|
|
@@ -1640,11 +1672,12 @@ with block:
|
|
| 1640 |
1, # end_stillness
|
| 1641 |
"A dolphin emerges from the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
|
| 1642 |
"image", # generation_mode
|
| 1643 |
-
"Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
|
| 1644 |
True, # randomize_seed
|
| 1645 |
42, # seed
|
| 1646 |
True, # auto_allocation
|
| 1647 |
180, # allocation_time
|
|
|
|
| 1648 |
672, # resolution
|
| 1649 |
1, # total_second_length
|
| 1650 |
9, # latent_window_size
|
|
@@ -1665,11 +1698,12 @@ with block:
|
|
| 1665 |
1, # end_stillness
|
| 1666 |
"A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks, the man stops talking and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens",
|
| 1667 |
"image", # generation_mode
|
| 1668 |
-
"Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
|
| 1669 |
True, # randomize_seed
|
| 1670 |
42, # seed
|
| 1671 |
True, # auto_allocation
|
| 1672 |
180, # allocation_time
|
|
|
|
| 1673 |
672, # resolution
|
| 1674 |
2, # total_second_length
|
| 1675 |
9, # latent_window_size
|
|
@@ -1690,11 +1724,12 @@ with block:
|
|
| 1690 |
1, # end_stillness
|
| 1691 |
"A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks, the woman stops talking and the woman listens A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens",
|
| 1692 |
"image", # generation_mode
|
| 1693 |
-
"Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
|
| 1694 |
True, # randomize_seed
|
| 1695 |
42, # seed
|
| 1696 |
True, # auto_allocation
|
| 1697 |
180, # allocation_time
|
|
|
|
| 1698 |
672, # resolution
|
| 1699 |
2, # total_second_length
|
| 1700 |
9, # latent_window_size
|
|
@@ -1720,6 +1755,7 @@ with block:
|
|
| 1720 |
42, # seed
|
| 1721 |
True, # auto_allocation
|
| 1722 |
180, # allocation_time
|
|
|
|
| 1723 |
672, # resolution
|
| 1724 |
1, # total_second_length
|
| 1725 |
9, # latent_window_size
|
|
@@ -1740,11 +1776,12 @@ with block:
|
|
| 1740 |
1, # end_stillness
|
| 1741 |
"A building starting to explode, photorealistic, realisitc, 8k, insanely detailed",
|
| 1742 |
"image", # generation_mode
|
| 1743 |
-
"Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
|
| 1744 |
True, # randomize_seed
|
| 1745 |
42, # seed
|
| 1746 |
True, # auto_allocation
|
| 1747 |
180, # allocation_time
|
|
|
|
| 1748 |
672, # resolution
|
| 1749 |
1, # total_second_length
|
| 1750 |
9, # latent_window_size
|
|
@@ -1762,7 +1799,7 @@ with block:
|
|
| 1762 |
run_on_click = True,
|
| 1763 |
fn = process,
|
| 1764 |
inputs = ips,
|
| 1765 |
-
outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
|
| 1766 |
cache_examples = False,
|
| 1767 |
)
|
| 1768 |
|
|
@@ -1776,11 +1813,12 @@ with block:
|
|
| 1776 |
0, # end_stillness
|
| 1777 |
"A woman jumps out of the train and arrives on the ground, viewed from the outside, photorealistic, realistic, amateur photography, midday, insanely detailed, 8k", # prompt
|
| 1778 |
"start_end", # generation_mode
|
| 1779 |
-
"Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, jumpcut, crossfader, crossfading", # n_prompt
|
| 1780 |
True, # randomize_seed
|
| 1781 |
42, # seed
|
| 1782 |
True, # auto_allocation
|
| 1783 |
180, # allocation_time
|
|
|
|
| 1784 |
672, # resolution
|
| 1785 |
1, # total_second_length
|
| 1786 |
9, # latent_window_size
|
|
@@ -1798,7 +1836,7 @@ with block:
|
|
| 1798 |
run_on_click = True,
|
| 1799 |
fn = process,
|
| 1800 |
inputs = ips,
|
| 1801 |
-
outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
|
| 1802 |
cache_examples = False,
|
| 1803 |
)
|
| 1804 |
|
|
@@ -1810,7 +1848,7 @@ with block:
|
|
| 1810 |
None, # end_image
|
| 1811 |
1, # end_stillness
|
| 1812 |
"View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
|
| 1813 |
-
"Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, jumpcut, crossfader, crossfading", # n_prompt
|
| 1814 |
True, # randomize_seed
|
| 1815 |
42, # seed
|
| 1816 |
True, # auto_allocation
|
|
@@ -1836,7 +1874,7 @@ with block:
|
|
| 1836 |
"./img_examples/Example1.png", # end_image
|
| 1837 |
1, # end_stillness
|
| 1838 |
"View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
|
| 1839 |
-
"Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, jumpcut, crossfader, crossfading", # n_prompt
|
| 1840 |
True, # randomize_seed
|
| 1841 |
42, # seed
|
| 1842 |
True, # auto_allocation
|
|
@@ -1861,7 +1899,7 @@ with block:
|
|
| 1861 |
run_on_click = True,
|
| 1862 |
fn = process_video,
|
| 1863 |
inputs = ips_video,
|
| 1864 |
-
outputs = [result_video, preview_image, progress_desc, progress_bar, start_button_video, end_button, warning],
|
| 1865 |
cache_examples = False,
|
| 1866 |
)
|
| 1867 |
|
|
@@ -1899,7 +1937,6 @@ with block:
|
|
| 1899 |
gr.update(visible = True), # start_button
|
| 1900 |
gr.update(visible = False), # start_button_video
|
| 1901 |
gr.update(visible = False), # no_resize
|
| 1902 |
-
gr.update(visible = False), # batch
|
| 1903 |
gr.update(visible = False), # num_clean_frames
|
| 1904 |
gr.update(visible = False), # vae_batch
|
| 1905 |
gr.update(visible = False), # prompt_hint
|
|
@@ -1916,7 +1953,6 @@ with block:
|
|
| 1916 |
gr.update(visible = True), # start_button
|
| 1917 |
gr.update(visible = False), # start_button_video
|
| 1918 |
gr.update(visible = False), # no_resize
|
| 1919 |
-
gr.update(visible = False), # batch
|
| 1920 |
gr.update(visible = False), # num_clean_frames
|
| 1921 |
gr.update(visible = False), # vae_batch
|
| 1922 |
gr.update(visible = False), # prompt_hint
|
|
@@ -1933,7 +1969,6 @@ with block:
|
|
| 1933 |
gr.update(visible = True), # start_button
|
| 1934 |
gr.update(visible = False), # start_button_video
|
| 1935 |
gr.update(visible = False), # no_resize
|
| 1936 |
-
gr.update(visible = False), # batch
|
| 1937 |
gr.update(visible = False), # num_clean_frames
|
| 1938 |
gr.update(visible = False), # vae_batch
|
| 1939 |
gr.update(visible = False), # prompt_hint
|
|
@@ -1950,7 +1985,6 @@ with block:
|
|
| 1950 |
gr.update(visible = False), # start_button
|
| 1951 |
gr.update(visible = True), # start_button_video
|
| 1952 |
gr.update(visible = True), # no_resize
|
| 1953 |
-
gr.update(visible = True), # batch
|
| 1954 |
gr.update(visible = True), # num_clean_frames
|
| 1955 |
gr.update(visible = True), # vae_batch
|
| 1956 |
gr.update(visible = True), # prompt_hint
|
|
@@ -1961,10 +1995,10 @@ with block:
|
|
| 1961 |
timeless_prompt.change(fn=handle_timeless_prompt_change, inputs=[timeless_prompt], outputs=[final_prompt])
|
| 1962 |
start_button.click(fn = check_parameters, inputs = [
|
| 1963 |
generation_mode, input_image, input_video
|
| 1964 |
-
], outputs = [end_button, warning], queue = False, show_progress = False).success(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning], scroll_to_output = True)
|
| 1965 |
start_button_video.click(fn = check_parameters, inputs = [
|
| 1966 |
generation_mode, input_image, input_video
|
| 1967 |
-
], outputs = [end_button, warning], queue = False, show_progress = False).success(fn=process_video, inputs=ips_video, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button_video, end_button, warning], scroll_to_output = True)
|
| 1968 |
end_button.click(fn=end_process)
|
| 1969 |
|
| 1970 |
generation_mode.change(fn = save_preferences, inputs = [
|
|
@@ -1977,7 +2011,7 @@ with block:
|
|
| 1977 |
generation_mode.change(
|
| 1978 |
fn=handle_generation_mode_change,
|
| 1979 |
inputs=[generation_mode],
|
| 1980 |
-
outputs=[text_to_video_hint, image_position, input_image, end_image, end_stillness, input_video, start_button, start_button_video, no_resize,
|
| 1981 |
)
|
| 1982 |
|
| 1983 |
# Update display when the page loads
|
|
@@ -1985,7 +2019,7 @@ with block:
|
|
| 1985 |
fn=handle_generation_mode_change, inputs = [
|
| 1986 |
generation_mode
|
| 1987 |
], outputs = [
|
| 1988 |
-
text_to_video_hint, image_position, input_image, end_image, end_stillness, input_video, start_button, start_button_video, no_resize,
|
| 1989 |
]
|
| 1990 |
)
|
| 1991 |
|
|
|
|
| 41 |
from diffusers import AutoencoderKLHunyuanVideo
|
| 42 |
from transformers import LlamaModel, CLIPTextModel, LlamaTokenizerFast, CLIPTokenizer
|
| 43 |
from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode, vae_decode_fake
|
| 44 |
+
from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, state_dict_weighted_merge, state_dict_offset_merge
|
| 45 |
from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
|
| 46 |
from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
|
| 47 |
if torch.cuda.device_count() > 0:
|
|
|
|
| 368 |
raise
|
| 369 |
|
| 370 |
@torch.no_grad()
|
| 371 |
+
def worker(input_image, end_image, image_position, end_stillness, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
|
| 372 |
def encode_prompt(prompt, n_prompt):
|
| 373 |
llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
|
| 374 |
|
|
|
|
| 393 |
section_index = first_section_index
|
| 394 |
forward = (image_position == 0)
|
| 395 |
|
|
|
|
|
|
|
| 396 |
stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
|
| 397 |
|
| 398 |
try:
|
|
|
|
| 468 |
|
| 469 |
image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
|
| 470 |
|
| 471 |
+
for idx in range(batch):
|
| 472 |
+
if batch > 1:
|
| 473 |
+
print(f"Beginning video {idx+1} of {batch} with seed {seed} ")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 474 |
|
| 475 |
+
job_id = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")+f"_framepackf1_{width}-{total_second_length}sec_seed-{seed}_steps-{steps}_distilled-{gs}_cfg-{cfg}"
|
| 476 |
|
| 477 |
+
# Sampling
|
| 478 |
+
stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))
|
| 479 |
+
|
| 480 |
+
rnd = torch.Generator("cpu").manual_seed(seed)
|
| 481 |
+
|
| 482 |
+
history_latents = torch.zeros(size=(1, 16, 16 + 2 + 1, height // 8, width // 8), dtype=torch.float32, device=cpu)
|
| 483 |
+
start_latent = start_latent.to(history_latents)
|
| 484 |
+
history_pixels = None
|
| 485 |
+
|
| 486 |
+
history_latents = torch.cat([history_latents, start_latent] if forward else [start_latent, history_latents], dim=2)
|
| 487 |
+
total_generated_latent_frames = 1
|
| 488 |
+
|
| 489 |
+
if enable_preview:
|
| 490 |
+
def callback(d):
|
| 491 |
+
preview = d['denoised']
|
| 492 |
+
preview = vae_decode_fake(preview)
|
| 493 |
+
|
| 494 |
+
preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
|
| 495 |
+
preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
|
| 496 |
+
|
| 497 |
+
if stream.input_queue.top() == 'end':
|
| 498 |
+
stream.output_queue.push(('end', None))
|
| 499 |
+
raise KeyboardInterrupt('User ends the task.')
|
| 500 |
+
|
| 501 |
+
current_step = d['i'] + 1
|
| 502 |
+
percentage = int(100.0 * current_step / steps)
|
| 503 |
+
hint = f'Sampling {current_step}/{steps}'
|
| 504 |
+
desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / fps_number) :.2f} seconds (FPS-30), Resolution: {height}px * {width}px, Video {idx+1} of {batch}. The video is being extended now ...'
|
| 505 |
+
stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
|
| 506 |
+
return
|
| 507 |
else:
|
| 508 |
+
def callback(d):
|
| 509 |
+
return
|
| 510 |
+
|
| 511 |
+
indices = torch.arange(0, 1 + 16 + 2 + 1 + latent_window_size).unsqueeze(0)
|
| 512 |
if forward:
|
| 513 |
+
clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
|
| 514 |
+
clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
|
| 515 |
else:
|
| 516 |
+
latent_indices, clean_latent_1x_indices, clean_latent_2x_indices, clean_latent_4x_indices, clean_latent_indices_start = indices.split([latent_window_size, 1, 2, 16, 1], dim=1)
|
| 517 |
+
clean_latent_indices = torch.cat([clean_latent_1x_indices, clean_latent_indices_start], dim=1)
|
| 518 |
+
|
| 519 |
+
def post_process(forward, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, section_index, total_latent_sections, outputs_folder, mp4_crf, stream):
|
| 520 |
+
total_generated_latent_frames += int(generated_latents.shape[2])
|
| 521 |
+
history_latents = torch.cat([history_latents, generated_latents.to(history_latents)] if forward else [generated_latents.to(history_latents), history_latents], dim=2)
|
| 522 |
+
|
| 523 |
+
if not high_vram:
|
| 524 |
+
offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
|
| 525 |
+
load_model_as_complete(vae, target_device=gpu)
|
| 526 |
+
|
| 527 |
+
if history_pixels is None:
|
| 528 |
+
real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :] if forward else history_latents[:, :, :total_generated_latent_frames, :, :]
|
| 529 |
+
history_pixels = vae_decode(real_history_latents, vae).cpu()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 530 |
else:
|
| 531 |
+
section_latent_frames = latent_window_size * 2
|
| 532 |
+
overlapped_frames = latent_window_size * 4 - 3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 533 |
|
| 534 |
+
if forward:
|
| 535 |
+
real_history_latents = history_latents[:, :, -min(section_latent_frames, total_generated_latent_frames):, :, :]
|
| 536 |
+
history_pixels = soft_append_bcthw(history_pixels, vae_decode(real_history_latents, vae).cpu(), overlapped_frames)
|
| 537 |
+
else:
|
| 538 |
+
real_history_latents = history_latents[:, :, :min(section_latent_frames, total_generated_latent_frames), :, :]
|
| 539 |
+
history_pixels = soft_append_bcthw(vae_decode(real_history_latents, vae).cpu(), history_pixels, overlapped_frames)
|
| 540 |
+
|
| 541 |
+
if not high_vram:
|
| 542 |
+
unload_complete_models(text_encoder, text_encoder_2, image_encoder, vae, transformer)
|
| 543 |
+
|
| 544 |
+
if enable_preview or section_index == (0 if first_section_index == (total_latent_sections - 1) else (total_latent_sections - 1)):
|
| 545 |
+
output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
|
| 546 |
+
|
| 547 |
+
save_bcthw_as_mp4(history_pixels, output_filename, fps=fps_number, crf=mp4_crf)
|
| 548 |
+
|
| 549 |
+
print(f'Decoded. Current latent shape pixel shape {history_pixels.shape}')
|
| 550 |
+
|
| 551 |
+
stream.output_queue.push(('file', output_filename))
|
| 552 |
+
return [total_generated_latent_frames, history_latents, history_pixels]
|
| 553 |
+
|
| 554 |
+
while section_index < total_latent_sections:
|
| 555 |
+
if stream.input_queue.top() == 'end':
|
| 556 |
+
stream.output_queue.push(('end', None))
|
| 557 |
+
return
|
| 558 |
+
|
| 559 |
+
print(f'section_index = {section_index}, total_latent_sections = {total_latent_sections}')
|
| 560 |
+
|
| 561 |
+
prompt_index = min(section_index, len(prompt_parameters) - 1)
|
| 562 |
+
|
| 563 |
+
[llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n] = prompt_parameters[prompt_index]
|
| 564 |
+
|
| 565 |
+
if prompt_index < len(prompt_parameters) - 1 or (prompt_index == total_latent_sections - 1):
|
| 566 |
+
del prompt_parameters[prompt_index]
|
| 567 |
+
|
| 568 |
+
if not high_vram:
|
| 569 |
+
unload_complete_models()
|
| 570 |
+
move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
|
| 571 |
+
|
| 572 |
+
if use_teacache:
|
| 573 |
+
transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
|
| 574 |
+
else:
|
| 575 |
+
transformer.initialize_teacache(enable_teacache=False)
|
| 576 |
+
|
| 577 |
+
if forward:
|
| 578 |
+
clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -(16 + 2 + 1):, :, :].split([16, 2, 1], dim=2)
|
| 579 |
+
clean_latents = torch.cat([start_latent, clean_latents_1x], dim=2)
|
| 580 |
+
else:
|
| 581 |
+
clean_latents_1x, clean_latents_2x, clean_latents_4x = history_latents[:, :, :(1 + 2 + 16), :, :].split([1, 2, 16], dim=2)
|
| 582 |
+
clean_latents = torch.cat([clean_latents_1x, start_latent], dim=2)
|
| 583 |
+
|
| 584 |
+
generated_latents = sample_hunyuan(
|
| 585 |
+
transformer=transformer,
|
| 586 |
+
sampler='unipc',
|
| 587 |
+
width=width,
|
| 588 |
+
height=height,
|
| 589 |
+
frames=latent_window_size * 4 - 3,
|
| 590 |
+
real_guidance_scale=cfg,
|
| 591 |
+
distilled_guidance_scale=gs,
|
| 592 |
+
guidance_rescale=rs,
|
| 593 |
+
# shift=3.0,
|
| 594 |
+
num_inference_steps=steps,
|
| 595 |
+
generator=rnd,
|
| 596 |
+
prompt_embeds=llama_vec,
|
| 597 |
+
prompt_embeds_mask=llama_attention_mask,
|
| 598 |
+
prompt_poolers=clip_l_pooler,
|
| 599 |
+
negative_prompt_embeds=llama_vec_n,
|
| 600 |
+
negative_prompt_embeds_mask=llama_attention_mask_n,
|
| 601 |
+
negative_prompt_poolers=clip_l_pooler_n,
|
| 602 |
+
device=gpu,
|
| 603 |
+
dtype=torch.bfloat16,
|
| 604 |
+
image_embeddings=image_encoder_last_hidden_state,
|
| 605 |
+
latent_indices=latent_indices,
|
| 606 |
+
clean_latents=clean_latents,
|
| 607 |
+
clean_latent_indices=clean_latent_indices,
|
| 608 |
+
clean_latents_2x=clean_latents_2x,
|
| 609 |
+
clean_latent_2x_indices=clean_latent_2x_indices,
|
| 610 |
+
clean_latents_4x=clean_latents_4x,
|
| 611 |
+
clean_latent_4x_indices=clean_latent_4x_indices,
|
| 612 |
+
callback=callback,
|
| 613 |
+
)
|
| 614 |
+
del clean_latents
|
| 615 |
+
del clean_latents_2x
|
| 616 |
+
del clean_latents_4x
|
| 617 |
+
del latent_indices
|
| 618 |
+
del clean_latent_indices
|
| 619 |
+
del clean_latent_2x_indices
|
| 620 |
+
del clean_latent_4x_indices
|
| 621 |
+
|
| 622 |
+
[total_generated_latent_frames, history_latents, history_pixels] = post_process(forward, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, section_index, total_latent_sections, outputs_folder, mp4_crf, stream)
|
| 623 |
+
|
| 624 |
+
if not forward:
|
| 625 |
+
if section_index > 0:
|
| 626 |
+
section_index -= 1
|
| 627 |
+
else:
|
| 628 |
+
clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
|
| 629 |
+
clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
|
| 630 |
+
|
| 631 |
+
real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :]
|
| 632 |
+
zero_latents = history_latents[:, :, total_generated_latent_frames:, :, :]
|
| 633 |
+
history_latents = torch.cat([zero_latents, real_history_latents], dim=2)
|
| 634 |
+
del real_history_latents
|
| 635 |
+
del zero_latents
|
| 636 |
+
|
| 637 |
+
forward = True
|
| 638 |
+
section_index = first_section_index
|
| 639 |
+
|
| 640 |
+
if forward:
|
| 641 |
+
section_index += 1
|
| 642 |
+
|
| 643 |
+
seed = (seed + 1) % np.iinfo(np.int32).max
|
| 644 |
except:
|
| 645 |
traceback.print_exc()
|
| 646 |
|
|
|
|
| 653 |
return
|
| 654 |
|
| 655 |
@torch.no_grad()
|
| 656 |
+
def worker_start_end(input_image, end_image, image_position, end_stillness, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
|
| 657 |
def encode_prompt(prompt, n_prompt):
|
| 658 |
llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
|
| 659 |
|
|
|
|
| 673 |
|
| 674 |
total_latent_sections = (total_second_length * fps_number) / (latent_window_size * 4)
|
| 675 |
total_latent_sections = int(max(round(total_latent_sections), 1))
|
| 676 |
+
|
|
|
|
| 677 |
|
| 678 |
stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
|
| 679 |
|
|
|
|
| 733 |
load_model_as_complete(vae, target_device=gpu)
|
| 734 |
|
| 735 |
start_latent = vae_encode(input_image_pt, vae)
|
| 736 |
+
del input_image_pt
|
| 737 |
|
| 738 |
if has_end_image:
|
| 739 |
end_latent = vae_encode(end_image_pt, vae)
|
| 740 |
+
del end_image_pt
|
| 741 |
|
| 742 |
# CLIP Vision
|
| 743 |
stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
|
|
|
|
| 746 |
load_model_as_complete(image_encoder, target_device=gpu)
|
| 747 |
|
| 748 |
image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
|
| 749 |
+
del input_image_np
|
| 750 |
image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
|
| 751 |
|
| 752 |
if has_end_image:
|
|
|
|
| 770 |
# Dtype
|
| 771 |
image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
|
| 772 |
|
| 773 |
+
for idx in range(batch):
|
| 774 |
+
if batch > 1:
|
| 775 |
+
print(f"Beginning video {idx+1} of {batch} with seed {seed} ")
|
|
|
|
|
|
|
| 776 |
|
| 777 |
+
job_id = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")+f"_framepackse_{width}-{total_second_length}sec_seed-{seed}_steps-{steps}_distilled-{gs}_cfg-{cfg}"
|
|
|
|
|
|
|
|
|
|
| 778 |
|
| 779 |
+
# Sampling
|
| 780 |
+
stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 781 |
|
| 782 |
+
rnd = torch.Generator("cpu").manual_seed(seed)
|
| 783 |
+
num_frames = latent_window_size * 4 - 3
|
| 784 |
|
| 785 |
+
history_latents = torch.zeros(size=(1, 16, 1 + 2 + 16, height // 8, width // 8), dtype=torch.float32, device=cpu)
|
| 786 |
+
start_latent = start_latent.to(history_latents)
|
| 787 |
+
if has_end_image:
|
| 788 |
+
end_latent = end_latent.to(history_latents)
|
| 789 |
+
|
| 790 |
+
history_pixels = None
|
| 791 |
+
total_generated_latent_frames = 0
|
| 792 |
|
| 793 |
+
if total_latent_sections > 4:
|
| 794 |
+
# In theory the latent_paddings should follow the else sequence, but it seems that duplicating some
|
| 795 |
+
# items looks better than expanding it when total_latent_sections > 4
|
| 796 |
+
# One can try to remove below trick and just
|
| 797 |
+
# use `latent_paddings = list(reversed(range(total_latent_sections)))` to compare
|
| 798 |
+
latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0]
|
| 799 |
else:
|
| 800 |
+
# Convert an iterator to a list
|
| 801 |
+
latent_paddings = list(range(total_latent_sections - 1, -1, -1))
|
| 802 |
|
| 803 |
+
if enable_preview:
|
| 804 |
+
def callback(d):
|
| 805 |
+
preview = d['denoised']
|
| 806 |
+
preview = vae_decode_fake(preview)
|
| 807 |
+
|
| 808 |
+
preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
|
| 809 |
+
preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
|
| 810 |
+
|
| 811 |
+
if stream.input_queue.top() == 'end':
|
| 812 |
+
stream.output_queue.push(('end', None))
|
| 813 |
+
raise KeyboardInterrupt('User ends the task.')
|
| 814 |
+
|
| 815 |
+
current_step = d['i'] + 1
|
| 816 |
+
percentage = int(100.0 * current_step / steps)
|
| 817 |
+
hint = f'Sampling {current_step}/{steps}'
|
| 818 |
+
desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / fps_number) :.2f} seconds (FPS-30), Resolution: {height}px * {width}px, Video {idx+1} of {batch}. The video is being extended now ...'
|
| 819 |
+
stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
|
| 820 |
+
return
|
| 821 |
+
else:
|
| 822 |
+
def callback(d):
|
| 823 |
+
return
|
| 824 |
|
| 825 |
+
def post_process(job_id, start_latent, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, outputs_folder, mp4_crf, stream, is_last_section):
|
| 826 |
+
if is_last_section:
|
| 827 |
+
generated_latents = torch.cat([start_latent.to(generated_latents), generated_latents], dim=2)
|
| 828 |
|
| 829 |
+
total_generated_latent_frames += int(generated_latents.shape[2])
|
| 830 |
+
history_latents = torch.cat([generated_latents.to(history_latents), history_latents], dim=2)
|
| 831 |
|
| 832 |
+
if not high_vram:
|
| 833 |
+
offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
|
| 834 |
+
load_model_as_complete(vae, target_device=gpu)
|
| 835 |
|
| 836 |
+
if history_pixels is None:
|
| 837 |
+
history_pixels = vae_decode(history_latents[:, :, :total_generated_latent_frames, :, :], vae).cpu()
|
| 838 |
+
else:
|
| 839 |
+
section_latent_frames = (latent_window_size * 2 + 1) if is_last_section else (latent_window_size * 2)
|
| 840 |
+
overlapped_frames = latent_window_size * 4 - 3
|
| 841 |
|
| 842 |
+
current_pixels = vae_decode(history_latents[:, :, :min(total_generated_latent_frames, section_latent_frames)], vae).cpu()
|
| 843 |
+
history_pixels = soft_append_bcthw(current_pixels, history_pixels, overlapped_frames)
|
| 844 |
+
|
| 845 |
+
if not high_vram:
|
| 846 |
+
unload_complete_models(vae)
|
| 847 |
+
|
| 848 |
+
if enable_preview or is_last_section:
|
| 849 |
+
output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
|
| 850 |
+
|
| 851 |
+
save_bcthw_as_mp4(history_pixels, output_filename, fps=fps_number, crf=mp4_crf)
|
| 852 |
+
|
| 853 |
+
print(f'Decoded. Pixel shape {history_pixels.shape}')
|
| 854 |
+
|
| 855 |
+
stream.output_queue.push(('file', output_filename))
|
| 856 |
+
|
| 857 |
+
return [total_generated_latent_frames, history_latents, history_pixels]
|
|
|
|
|
|
|
|
|
|
| 858 |
|
| 859 |
+
for latent_padding in latent_paddings:
|
| 860 |
+
is_last_section = latent_padding == 0
|
| 861 |
+
is_first_section = latent_padding == latent_paddings[0]
|
| 862 |
+
latent_padding_size = latent_padding * latent_window_size
|
| 863 |
+
|
| 864 |
+
if stream.input_queue.top() == 'end':
|
| 865 |
+
stream.output_queue.push(('end', None))
|
| 866 |
+
return
|
| 867 |
+
|
| 868 |
+
print(f'latent_padding_size = {latent_padding_size}, is_last_section = {is_last_section}, is_first_section = {is_first_section}')
|
| 869 |
+
|
| 870 |
+
if len(prompt_parameters) > 0:
|
| 871 |
+
[llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n] = prompt_parameters.pop(len(prompt_parameters) - 1)
|
| 872 |
+
|
| 873 |
+
indices = torch.arange(1 + latent_padding_size + latent_window_size + 1 + (end_stillness if is_first_section else 0) + 2 + 16).unsqueeze(0)
|
| 874 |
+
clean_latent_indices_pre, blank_indices, latent_indices, clean_latent_indices_post, clean_latent_2x_indices, clean_latent_4x_indices = indices.split([1, latent_padding_size, latent_window_size, 1 + (end_stillness if is_first_section else 0), 2, 16], dim=1)
|
| 875 |
+
clean_latent_indices = torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1)
|
| 876 |
+
|
| 877 |
+
clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, :1 + 2 + 16, :, :].split([1, 2, 16], dim=2)
|
| 878 |
+
|
| 879 |
+
# Use end image latent for the first section if provided
|
| 880 |
+
if has_end_image and is_first_section:
|
| 881 |
+
clean_latents_post = end_latent.expand(-1, -1, 1 + end_stillness, -1, -1)
|
| 882 |
+
|
| 883 |
+
clean_latents = torch.cat([start_latent, clean_latents_post], dim=2)
|
| 884 |
+
|
| 885 |
+
if not high_vram:
|
| 886 |
+
unload_complete_models()
|
| 887 |
+
move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
|
| 888 |
+
|
| 889 |
+
if use_teacache:
|
| 890 |
+
transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
|
| 891 |
+
else:
|
| 892 |
+
transformer.initialize_teacache(enable_teacache=False)
|
| 893 |
+
|
| 894 |
+
generated_latents = sample_hunyuan(
|
| 895 |
+
transformer=transformer,
|
| 896 |
+
sampler='unipc',
|
| 897 |
+
width=width,
|
| 898 |
+
height=height,
|
| 899 |
+
frames=num_frames,
|
| 900 |
+
real_guidance_scale=cfg,
|
| 901 |
+
distilled_guidance_scale=gs,
|
| 902 |
+
guidance_rescale=rs,
|
| 903 |
+
# shift=3.0,
|
| 904 |
+
num_inference_steps=steps,
|
| 905 |
+
generator=rnd,
|
| 906 |
+
prompt_embeds=llama_vec,
|
| 907 |
+
prompt_embeds_mask=llama_attention_mask,
|
| 908 |
+
prompt_poolers=clip_l_pooler,
|
| 909 |
+
negative_prompt_embeds=llama_vec_n,
|
| 910 |
+
negative_prompt_embeds_mask=llama_attention_mask_n,
|
| 911 |
+
negative_prompt_poolers=clip_l_pooler_n,
|
| 912 |
+
device=gpu,
|
| 913 |
+
dtype=torch.bfloat16,
|
| 914 |
+
image_embeddings=image_encoder_last_hidden_state,
|
| 915 |
+
latent_indices=latent_indices,
|
| 916 |
+
clean_latents=clean_latents,
|
| 917 |
+
clean_latent_indices=clean_latent_indices,
|
| 918 |
+
clean_latents_2x=clean_latents_2x,
|
| 919 |
+
clean_latent_2x_indices=clean_latent_2x_indices,
|
| 920 |
+
clean_latents_4x=clean_latents_4x,
|
| 921 |
+
clean_latent_4x_indices=clean_latent_4x_indices,
|
| 922 |
+
callback=callback,
|
| 923 |
+
)
|
| 924 |
+
del clean_latents
|
| 925 |
+
del clean_latents_2x
|
| 926 |
+
del clean_latents_4x
|
| 927 |
+
del latent_indices
|
| 928 |
+
del clean_latent_indices
|
| 929 |
+
del clean_latent_2x_indices
|
| 930 |
+
del clean_latent_4x_indices
|
| 931 |
+
|
| 932 |
+
[total_generated_latent_frames, history_latents, history_pixels] = post_process(job_id, start_latent, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, outputs_folder, mp4_crf, stream, is_last_section)
|
| 933 |
+
|
| 934 |
+
if is_last_section:
|
| 935 |
+
break
|
| 936 |
|
| 937 |
+
seed = (seed + 1) % np.iinfo(np.int32).max
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 938 |
except:
|
| 939 |
traceback.print_exc()
|
| 940 |
|
|
|
|
| 1131 |
if batch > 1:
|
| 1132 |
print(f"Beginning video {idx+1} of {batch} with seed {seed} ")
|
| 1133 |
|
|
|
|
| 1134 |
job_id = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")+f"_framepackf1-videoinput_{width}-{total_second_length}sec_seed-{seed}_steps-{steps}_distilled-{gs}_cfg-{cfg}" # 20250506 pftq: easier to read timestamp and filename
|
| 1135 |
|
| 1136 |
# Sampling
|
|
|
|
| 1146 |
|
| 1147 |
# 20250509 Generate backwards with end frame for better end frame anchoring
|
| 1148 |
if total_latent_sections > 4:
|
| 1149 |
+
latent_paddings = [3, 2] + [1] * (total_latent_sections - 3) + [0]
|
| 1150 |
else:
|
| 1151 |
latent_paddings = list(reversed(range(total_latent_sections)))
|
| 1152 |
|
|
|
|
| 1267 |
stream.output_queue.push(('end', None))
|
| 1268 |
return
|
| 1269 |
|
| 1270 |
+
def get_duration(input_image, end_image, image_position, end_stillness, prompts, generation_mode, n_prompt, seed, batch, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
|
| 1271 |
return allocation_time
|
| 1272 |
|
| 1273 |
@spaces.GPU(duration=get_duration)
|
| 1274 |
+
def process_on_gpu(input_image, end_image, image_position, end_stillness, prompts, generation_mode, n_prompt, seed, batch, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number
|
| 1275 |
):
|
| 1276 |
start = time.time()
|
| 1277 |
global stream
|
| 1278 |
stream = AsyncStream()
|
| 1279 |
|
| 1280 |
+
async_run(worker_start_end if generation_mode == "start_end" else worker, input_image, end_image, image_position, end_stillness, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number)
|
| 1281 |
|
| 1282 |
output_filename = None
|
| 1283 |
+
output_filenames = ""
|
| 1284 |
|
| 1285 |
while True:
|
| 1286 |
flag, data = stream.output_queue.next()
|
| 1287 |
|
| 1288 |
if flag == 'file':
|
| 1289 |
output_filename = data
|
| 1290 |
+
output_filenames = output_filenames + ";" + str(output_filename)
|
| 1291 |
+
print("output_filename=" + str(output_filename))
|
| 1292 |
+
yield gr.update(value=output_filename, label="Previewed Frames"), gr.update(value=output_filenames, visible=True), gr.skip(), gr.skip(), gr.skip(), gr.update(interactive=False), gr.update(interactive=True), gr.skip()
|
| 1293 |
|
| 1294 |
if flag == 'progress':
|
| 1295 |
preview, desc, html = data
|
| 1296 |
+
yield gr.update(label="Previewed Frames"), gr.skip(), gr.skip(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True), gr.skip()
|
| 1297 |
|
| 1298 |
if flag == 'end':
|
| 1299 |
end = time.time()
|
|
|
|
| 1302 |
secondes = secondes - (minutes * 60)
|
| 1303 |
hours = math.floor(minutes / 60)
|
| 1304 |
minutes = minutes - (hours * 60)
|
| 1305 |
+
yield gr.update(value=output_filename, label="Finished Frames"), gr.update(value=output_filenames, visible=True), gr.update(visible=False), gr.skip(), "The process has lasted " + \
|
| 1306 |
((str(hours) + " h, ") if hours != 0 else "") + \
|
| 1307 |
((str(minutes) + " min, ") if hours != 0 or minutes != 0 else "") + \
|
| 1308 |
str(secondes) + " sec. " + \
|
|
|
|
| 1320 |
seed=31337,
|
| 1321 |
auto_allocation=True,
|
| 1322 |
allocation_time=180,
|
| 1323 |
+
batch=1,
|
| 1324 |
resolution=640,
|
| 1325 |
total_second_length=5,
|
| 1326 |
latent_window_size=9,
|
|
|
|
| 1339 |
|
| 1340 |
if torch.cuda.device_count() == 0:
|
| 1341 |
gr.Warning('Set this space to GPU config to make it work.')
|
| 1342 |
+
yield gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.update(visible = False)
|
| 1343 |
return
|
| 1344 |
|
| 1345 |
if randomize_seed:
|
|
|
|
| 1354 |
assert input_image is not None, 'No input image!'
|
| 1355 |
assert (generation_mode != "start_end") or end_image is not None, 'No end image!'
|
| 1356 |
|
| 1357 |
+
yield gr.update(label="Previewed Frames"), gr.update(value = ""), None, '', '', gr.update(interactive=False), gr.update(interactive=True), gr.skip()
|
| 1358 |
|
| 1359 |
gc.collect()
|
| 1360 |
yield from process_on_gpu(input_image,
|
|
|
|
| 1365 |
generation_mode,
|
| 1366 |
n_prompt,
|
| 1367 |
seed,
|
| 1368 |
+
batch,
|
| 1369 |
resolution,
|
| 1370 |
total_second_length,
|
| 1371 |
allocation_time,
|
|
|
|
| 1394 |
async_run(worker_video, input_video, end_frame, end_stillness, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch)
|
| 1395 |
|
| 1396 |
output_filename = None
|
| 1397 |
+
output_filenames = ""
|
| 1398 |
|
| 1399 |
while True:
|
| 1400 |
flag, data = stream.output_queue.next()
|
| 1401 |
|
| 1402 |
if flag == 'file':
|
| 1403 |
output_filename = data
|
| 1404 |
+
print("output_filename=" + str(output_filename))
|
| 1405 |
+
output_filenames = output_filenames + ";" + str(output_filename)
|
| 1406 |
+
yield gr.update(value=output_filename, label="Previewed Frames"), gr.update(value=output_filenames, visible=True), gr.skip(), gr.skip(), gr.skip(), gr.update(interactive=False), gr.update(interactive=True), gr.skip()
|
| 1407 |
|
| 1408 |
if flag == 'progress':
|
| 1409 |
preview, desc, html = data
|
| 1410 |
+
yield gr.update(label="Previewed Frames"), gr.skip(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True), gr.skip() # 20250506 pftq: Keep refreshing the video in case it got hidden when the tab was in the background
|
| 1411 |
|
| 1412 |
if flag == 'end':
|
| 1413 |
end = time.time()
|
|
|
|
| 1416 |
secondes = secondes - (minutes * 60)
|
| 1417 |
hours = math.floor(minutes / 60)
|
| 1418 |
minutes = minutes - (hours * 60)
|
| 1419 |
+
yield gr.update(value=output_filename, label="Finished Frames"), gr.update(value=output_filenames, visible=True), gr.update(visible=False), desc + \
|
| 1420 |
" The process has lasted " + \
|
| 1421 |
((str(hours) + " h, ") if hours != 0 else "") + \
|
| 1422 |
((str(minutes) + " min, ") if hours != 0 or minutes != 0 else "") + \
|
|
|
|
| 1431 |
|
| 1432 |
if torch.cuda.device_count() == 0:
|
| 1433 |
gr.Warning('Set this space to GPU config to make it work.')
|
| 1434 |
+
yield gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.update(visible = False)
|
| 1435 |
return
|
| 1436 |
|
| 1437 |
if randomize_seed:
|
|
|
|
| 1442 |
# 20250506 pftq: Updated assertion for video input
|
| 1443 |
assert input_video is not None, 'No input video!'
|
| 1444 |
|
| 1445 |
+
yield gr.update(label="Previewed Frames"), gr.update(value = ""), None, '', '', gr.update(interactive=False), gr.update(interactive=True), gr.skip()
|
| 1446 |
|
| 1447 |
# 20250507 pftq: Even the H100 needs offloading if the video dimensions are 720p or higher
|
| 1448 |
if high_vram and (no_resize or resolution>640):
|
|
|
|
| 1557 |
enable_preview = gr.Checkbox(label='Enable preview', value=True, info='Display a preview around each second generated but it costs 2 sec. for each second generated.')
|
| 1558 |
use_teacache = gr.Checkbox(label='Use TeaCache', value=False, info='Faster speed and no break in brightness, but often makes hands and fingers slightly worse.')
|
| 1559 |
|
| 1560 |
+
n_prompt = gr.Textbox(label="Negative Prompt", value="Missing arm, long hand, sagging skin, unjustified body movement, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, abrupt change in camera trajectory, jumpcut, crossfader, crossfading", info='Requires using normal CFG (undistilled) instead of Distilled (set Distilled=1 and CFG > 1).')
|
| 1561 |
|
| 1562 |
fps_number = gr.Slider(label="Frame per seconds", info="The model is trained for 30 fps so other fps may generate weird results", minimum=10, maximum=60, value=30, step=1)
|
| 1563 |
end_stillness = gr.Slider(label="End stillness", minimum=0, maximum=100, value=0, step=1, info='0=Realistic end; >0=Matches exactly the end image (but the time seems to freeze)')
|
|
|
|
| 1570 |
resolution = gr.Dropdown([
|
| 1571 |
["409,600 px (working)", 640],
|
| 1572 |
["451,584 px (working)", 672],
|
| 1573 |
+
["495,616 px (working for extension)", 704],
|
| 1574 |
["589,824 px (not tested)", 768],
|
| 1575 |
["692,224 px (not tested)", 832],
|
| 1576 |
["746,496 px (not tested)", 864],
|
|
|
|
| 1598 |
gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. Larger value causes slower speed.")
|
| 1599 |
|
| 1600 |
mp4_crf = gr.Slider(label="MP4 Compression", minimum=0, maximum=100, value=16, step=1, info="Lower means better quality. 0 is uncompressed. Change to 16 if you get black outputs. ")
|
| 1601 |
+
batch = gr.Slider(label="Batch Size (number of videos)", minimum=1, maximum=1000, value=1, step=1, info='Generate multiple videos each with a different seed')
|
| 1602 |
with gr.Row():
|
| 1603 |
randomize_seed = gr.Checkbox(label='Randomize seed', value=True, info='If checked, the seed is always different')
|
| 1604 |
seed = gr.Slider(label="Seed", minimum=0, maximum=np.iinfo(np.int32).max, step=1, randomize=True)
|
|
|
|
| 1608 |
|
| 1609 |
with gr.Column():
|
| 1610 |
warning = gr.HTML(elem_id="warning", value = "<center><big>Your computer must <u>not</u> enter into standby mode.</big><br/>On Chrome, you can force to keep a tab alive in <code>chrome://discards/</code></center>", visible = False)
|
| 1611 |
+
result_video = gr.Video(label="Generated Frames", autoplay = True, show_share_button = False, height = 512, loop = True)
|
| 1612 |
+
download_textbox = gr.HTML(label="Download list", visible = False)
|
| 1613 |
+
|
| 1614 |
+
@gr.render(inputs=download_textbox)
|
| 1615 |
+
def show_split(download_textbox):
|
| 1616 |
+
if len(download_textbox) > 0:
|
| 1617 |
+
pathes = download_textbox.split(";")[1:]
|
| 1618 |
+
for one_path in pathes:
|
| 1619 |
+
one_download_button = gr.DownloadButton(label="Download", value=one_path)
|
| 1620 |
+
|
| 1621 |
+
preview_image = gr.Image(label="Next Latents", height = 200, visible = False)
|
| 1622 |
progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
|
| 1623 |
progress_bar = gr.HTML('', elem_classes='no-generating-animation')
|
| 1624 |
|
| 1625 |
+
ips = [input_image, end_image, image_position, end_stillness, final_prompt, generation_mode, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number]
|
| 1626 |
ips_video = [input_video, end_image, end_stillness, final_prompt, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch]
|
| 1627 |
|
| 1628 |
gr.Examples(
|
|
|
|
| 1635 |
1, # end_stillness
|
| 1636 |
"Overcrowed street in Japan, photorealistic, realistic, intricate details, 8k, insanely detailed",
|
| 1637 |
"text", # generation_mode
|
| 1638 |
+
"Missing arm, long hand, sagging skin, unjustified body movement, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, abrupt change in camera trajectory, jumpcut, crossfader, crossfading", # n_prompt
|
| 1639 |
True, # randomize_seed
|
| 1640 |
42, # seed
|
| 1641 |
True, # auto_allocation
|
| 1642 |
180, # allocation_time
|
| 1643 |
+
1, # batch
|
| 1644 |
672, # resolution
|
| 1645 |
1, # total_second_length
|
| 1646 |
9, # latent_window_size
|
|
|
|
| 1658 |
run_on_click = True,
|
| 1659 |
fn = process,
|
| 1660 |
inputs = ips,
|
| 1661 |
+
outputs = [result_video, download_textbox, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
|
| 1662 |
cache_examples = False,
|
| 1663 |
)
|
| 1664 |
|
|
|
|
| 1672 |
1, # end_stillness
|
| 1673 |
"A dolphin emerges from the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
|
| 1674 |
"image", # generation_mode
|
| 1675 |
+
"Missing arm, long hand, sagging skin, unjustified body movement, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, abrupt change in camera trajectory, jumpcut, crossfader, crossfading", # n_prompt
|
| 1676 |
True, # randomize_seed
|
| 1677 |
42, # seed
|
| 1678 |
True, # auto_allocation
|
| 1679 |
180, # allocation_time
|
| 1680 |
+
1, # batch
|
| 1681 |
672, # resolution
|
| 1682 |
1, # total_second_length
|
| 1683 |
9, # latent_window_size
|
|
|
|
| 1698 |
1, # end_stillness
|
| 1699 |
"A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks, the man stops talking and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens",
|
| 1700 |
"image", # generation_mode
|
| 1701 |
+
"Missing arm, long hand, sagging skin, unjustified body movement, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, abrupt change in camera trajectory, jumpcut, crossfader, crossfading", # n_prompt
|
| 1702 |
True, # randomize_seed
|
| 1703 |
42, # seed
|
| 1704 |
True, # auto_allocation
|
| 1705 |
180, # allocation_time
|
| 1706 |
+
1, # batch
|
| 1707 |
672, # resolution
|
| 1708 |
2, # total_second_length
|
| 1709 |
9, # latent_window_size
|
|
|
|
| 1724 |
1, # end_stillness
|
| 1725 |
"A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks, the woman stops talking and the woman listens A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens",
|
| 1726 |
"image", # generation_mode
|
| 1727 |
+
"Missing arm, long hand, sagging skin, unjustified body movement, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, abrupt change in camera trajectory, jumpcut, crossfader, crossfading", # n_prompt
|
| 1728 |
True, # randomize_seed
|
| 1729 |
42, # seed
|
| 1730 |
True, # auto_allocation
|
| 1731 |
180, # allocation_time
|
| 1732 |
+
1, # batch
|
| 1733 |
672, # resolution
|
| 1734 |
2, # total_second_length
|
| 1735 |
9, # latent_window_size
|
|
|
|
| 1755 |
42, # seed
|
| 1756 |
True, # auto_allocation
|
| 1757 |
180, # allocation_time
|
| 1758 |
+
1, # batch
|
| 1759 |
672, # resolution
|
| 1760 |
1, # total_second_length
|
| 1761 |
9, # latent_window_size
|
|
|
|
| 1776 |
1, # end_stillness
|
| 1777 |
"A building starting to explode, photorealistic, realisitc, 8k, insanely detailed",
|
| 1778 |
"image", # generation_mode
|
| 1779 |
+
"Missing arm, long hand, sagging skin, unjustified body movement, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, abrupt change in camera trajectory, jumpcut, crossfader, crossfading", # n_prompt
|
| 1780 |
True, # randomize_seed
|
| 1781 |
42, # seed
|
| 1782 |
True, # auto_allocation
|
| 1783 |
180, # allocation_time
|
| 1784 |
+
1, # batch
|
| 1785 |
672, # resolution
|
| 1786 |
1, # total_second_length
|
| 1787 |
9, # latent_window_size
|
|
|
|
| 1799 |
run_on_click = True,
|
| 1800 |
fn = process,
|
| 1801 |
inputs = ips,
|
| 1802 |
+
outputs = [result_video, download_textbox, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
|
| 1803 |
cache_examples = False,
|
| 1804 |
)
|
| 1805 |
|
|
|
|
| 1813 |
0, # end_stillness
|
| 1814 |
"A woman jumps out of the train and arrives on the ground, viewed from the outside, photorealistic, realistic, amateur photography, midday, insanely detailed, 8k", # prompt
|
| 1815 |
"start_end", # generation_mode
|
| 1816 |
+
"Missing arm, long hand, sagging skin, unjustified body movement, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, abrupt change in camera trajectory, jumpcut, crossfader, crossfading", # n_prompt
|
| 1817 |
True, # randomize_seed
|
| 1818 |
42, # seed
|
| 1819 |
True, # auto_allocation
|
| 1820 |
180, # allocation_time
|
| 1821 |
+
1, # batch
|
| 1822 |
672, # resolution
|
| 1823 |
1, # total_second_length
|
| 1824 |
9, # latent_window_size
|
|
|
|
| 1836 |
run_on_click = True,
|
| 1837 |
fn = process,
|
| 1838 |
inputs = ips,
|
| 1839 |
+
outputs = [result_video, download_textbox, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
|
| 1840 |
cache_examples = False,
|
| 1841 |
)
|
| 1842 |
|
|
|
|
| 1848 |
None, # end_image
|
| 1849 |
1, # end_stillness
|
| 1850 |
"View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
|
| 1851 |
+
"Missing arm, long hand, sagging skin, unjustified body movement, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, abrupt change in camera trajectory, jumpcut, crossfader, crossfading", # n_prompt
|
| 1852 |
True, # randomize_seed
|
| 1853 |
42, # seed
|
| 1854 |
True, # auto_allocation
|
|
|
|
| 1874 |
"./img_examples/Example1.png", # end_image
|
| 1875 |
1, # end_stillness
|
| 1876 |
"View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
|
| 1877 |
+
"Missing arm, long hand, sagging skin, unjustified body movement, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, abrupt change in camera trajectory, jumpcut, crossfader, crossfading", # n_prompt
|
| 1878 |
True, # randomize_seed
|
| 1879 |
42, # seed
|
| 1880 |
True, # auto_allocation
|
|
|
|
| 1899 |
run_on_click = True,
|
| 1900 |
fn = process_video,
|
| 1901 |
inputs = ips_video,
|
| 1902 |
+
outputs = [result_video, download_textbox, preview_image, progress_desc, progress_bar, start_button_video, end_button, warning],
|
| 1903 |
cache_examples = False,
|
| 1904 |
)
|
| 1905 |
|
|
|
|
| 1937 |
gr.update(visible = True), # start_button
|
| 1938 |
gr.update(visible = False), # start_button_video
|
| 1939 |
gr.update(visible = False), # no_resize
|
|
|
|
| 1940 |
gr.update(visible = False), # num_clean_frames
|
| 1941 |
gr.update(visible = False), # vae_batch
|
| 1942 |
gr.update(visible = False), # prompt_hint
|
|
|
|
| 1953 |
gr.update(visible = True), # start_button
|
| 1954 |
gr.update(visible = False), # start_button_video
|
| 1955 |
gr.update(visible = False), # no_resize
|
|
|
|
| 1956 |
gr.update(visible = False), # num_clean_frames
|
| 1957 |
gr.update(visible = False), # vae_batch
|
| 1958 |
gr.update(visible = False), # prompt_hint
|
|
|
|
| 1969 |
gr.update(visible = True), # start_button
|
| 1970 |
gr.update(visible = False), # start_button_video
|
| 1971 |
gr.update(visible = False), # no_resize
|
|
|
|
| 1972 |
gr.update(visible = False), # num_clean_frames
|
| 1973 |
gr.update(visible = False), # vae_batch
|
| 1974 |
gr.update(visible = False), # prompt_hint
|
|
|
|
| 1985 |
gr.update(visible = False), # start_button
|
| 1986 |
gr.update(visible = True), # start_button_video
|
| 1987 |
gr.update(visible = True), # no_resize
|
|
|
|
| 1988 |
gr.update(visible = True), # num_clean_frames
|
| 1989 |
gr.update(visible = True), # vae_batch
|
| 1990 |
gr.update(visible = True), # prompt_hint
|
|
|
|
| 1995 |
timeless_prompt.change(fn=handle_timeless_prompt_change, inputs=[timeless_prompt], outputs=[final_prompt])
|
| 1996 |
start_button.click(fn = check_parameters, inputs = [
|
| 1997 |
generation_mode, input_image, input_video
|
| 1998 |
+
], outputs = [end_button, warning], queue = False, show_progress = False).success(fn=process, inputs=ips, outputs=[result_video, download_textbox, preview_image, progress_desc, progress_bar, start_button, end_button, warning], scroll_to_output = True)
|
| 1999 |
start_button_video.click(fn = check_parameters, inputs = [
|
| 2000 |
generation_mode, input_image, input_video
|
| 2001 |
+
], outputs = [end_button, warning], queue = False, show_progress = False).success(fn=process_video, inputs=ips_video, outputs=[result_video, download_textbox, preview_image, progress_desc, progress_bar, start_button_video, end_button, warning], scroll_to_output = True)
|
| 2002 |
end_button.click(fn=end_process)
|
| 2003 |
|
| 2004 |
generation_mode.change(fn = save_preferences, inputs = [
|
|
|
|
| 2011 |
generation_mode.change(
|
| 2012 |
fn=handle_generation_mode_change,
|
| 2013 |
inputs=[generation_mode],
|
| 2014 |
+
outputs=[text_to_video_hint, image_position, input_image, end_image, end_stillness, input_video, start_button, start_button_video, no_resize, num_clean_frames, vae_batch, prompt_hint, fps_number]
|
| 2015 |
)
|
| 2016 |
|
| 2017 |
# Update display when the page loads
|
|
|
|
| 2019 |
fn=handle_generation_mode_change, inputs = [
|
| 2020 |
generation_mode
|
| 2021 |
], outputs = [
|
| 2022 |
+
text_to_video_hint, image_position, input_image, end_image, end_stillness, input_video, start_button, start_button_video, no_resize, num_clean_frames, vae_batch, prompt_hint, fps_number
|
| 2023 |
]
|
| 2024 |
)
|
| 2025 |
|