Fabrice-TIERCELIN committed on
Commit
8261bda
·
verified ·
1 Parent(s): bb5e9a5

Batch mode

Browse files
Files changed (1) hide show
  1. app.py +384 -350
app.py CHANGED
@@ -41,7 +41,7 @@ from PIL import Image
41
  from diffusers import AutoencoderKLHunyuanVideo
42
  from transformers import LlamaModel, CLIPTextModel, LlamaTokenizerFast, CLIPTokenizer
43
  from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode, vae_decode_fake
44
- from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, state_dict_weighted_merge, state_dict_offset_merge, generate_timestamp
45
  from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
46
  from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
47
  if torch.cuda.device_count() > 0:
@@ -368,7 +368,7 @@ def image_encode(image_np, target_width, target_height, vae, image_encoder, feat
368
  raise
369
 
370
  @torch.no_grad()
371
- def worker(input_image, end_image, image_position, end_stillness, prompts, n_prompt, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
372
  def encode_prompt(prompt, n_prompt):
373
  llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
374
 
@@ -393,8 +393,6 @@ def worker(input_image, end_image, image_position, end_stillness, prompts, n_pro
393
  section_index = first_section_index
394
  forward = (image_position == 0)
395
 
396
- job_id = generate_timestamp()
397
-
398
  stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
399
 
400
  try:
@@ -470,172 +468,179 @@ def worker(input_image, end_image, image_position, end_stillness, prompts, n_pro
470
 
471
  image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
472
 
473
- # Sampling
474
-
475
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))
476
-
477
- rnd = torch.Generator("cpu").manual_seed(seed)
478
-
479
- history_latents = torch.zeros(size=(1, 16, 16 + 2 + 1, height // 8, width // 8), dtype=torch.float32, device=cpu)
480
- start_latent = start_latent.to(history_latents)
481
- history_pixels = None
482
-
483
- history_latents = torch.cat([history_latents, start_latent] if forward else [start_latent, history_latents], dim=2)
484
- total_generated_latent_frames = 1
485
-
486
- if enable_preview:
487
- def callback(d):
488
- preview = d['denoised']
489
- preview = vae_decode_fake(preview)
490
-
491
- preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
492
- preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
493
-
494
- if stream.input_queue.top() == 'end':
495
- stream.output_queue.push(('end', None))
496
- raise KeyboardInterrupt('User ends the task.')
497
-
498
- current_step = d['i'] + 1
499
- percentage = int(100.0 * current_step / steps)
500
- hint = f'Sampling {current_step}/{steps}'
501
- desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / fps_number) :.2f} seconds (FPS-30), Resolution: {height}px * {width}px. The video is being extended now ...'
502
- stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
503
- return
504
- else:
505
- def callback(d):
506
- return
507
-
508
- indices = torch.arange(0, 1 + 16 + 2 + 1 + latent_window_size).unsqueeze(0)
509
- if forward:
510
- clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
511
- clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
512
- else:
513
- latent_indices, clean_latent_1x_indices, clean_latent_2x_indices, clean_latent_4x_indices, clean_latent_indices_start = indices.split([latent_window_size, 1, 2, 16, 1], dim=1)
514
- clean_latent_indices = torch.cat([clean_latent_1x_indices, clean_latent_indices_start], dim=1)
515
-
516
- def post_process(forward, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, section_index, total_latent_sections, outputs_folder, mp4_crf, stream):
517
- total_generated_latent_frames += int(generated_latents.shape[2])
518
- history_latents = torch.cat([history_latents, generated_latents.to(history_latents)] if forward else [generated_latents.to(history_latents), history_latents], dim=2)
519
-
520
- if not high_vram:
521
- offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
522
- load_model_as_complete(vae, target_device=gpu)
523
-
524
- if history_pixels is None:
525
- real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :] if forward else history_latents[:, :, :total_generated_latent_frames, :, :]
526
- history_pixels = vae_decode(real_history_latents, vae).cpu()
527
- else:
528
- section_latent_frames = latent_window_size * 2
529
- overlapped_frames = latent_window_size * 4 - 3
530
-
531
- if forward:
532
- real_history_latents = history_latents[:, :, -min(section_latent_frames, total_generated_latent_frames):, :, :]
533
- history_pixels = soft_append_bcthw(history_pixels, vae_decode(real_history_latents, vae).cpu(), overlapped_frames)
534
- else:
535
- real_history_latents = history_latents[:, :, :min(section_latent_frames, total_generated_latent_frames), :, :]
536
- history_pixels = soft_append_bcthw(vae_decode(real_history_latents, vae).cpu(), history_pixels, overlapped_frames)
537
-
538
- if not high_vram:
539
- unload_complete_models(text_encoder, text_encoder_2, image_encoder, vae, transformer)
540
-
541
- if enable_preview or section_index == (0 if first_section_index == (total_latent_sections - 1) else (total_latent_sections - 1)):
542
- output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
543
-
544
- save_bcthw_as_mp4(history_pixels, output_filename, fps=fps_number, crf=mp4_crf)
545
-
546
- print(f'Decoded. Current latent shape pixel shape {history_pixels.shape}')
547
-
548
- stream.output_queue.push(('file', output_filename))
549
- return [total_generated_latent_frames, history_latents, history_pixels]
550
-
551
- while section_index < total_latent_sections:
552
- if stream.input_queue.top() == 'end':
553
- stream.output_queue.push(('end', None))
554
- return
555
-
556
- print(f'section_index = {section_index}, total_latent_sections = {total_latent_sections}')
557
-
558
- prompt_index = min(section_index, len(prompt_parameters) - 1)
559
 
560
- [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n] = prompt_parameters[prompt_index]
561
 
562
- if prompt_index < len(prompt_parameters) - 1 or (prompt_index == total_latent_sections - 1):
563
- del prompt_parameters[prompt_index]
564
-
565
- if not high_vram:
566
- unload_complete_models()
567
- move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
568
-
569
- if use_teacache:
570
- transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
571
  else:
572
- transformer.initialize_teacache(enable_teacache=False)
573
-
 
 
574
  if forward:
575
- clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -(16 + 2 + 1):, :, :].split([16, 2, 1], dim=2)
576
- clean_latents = torch.cat([start_latent, clean_latents_1x], dim=2)
577
  else:
578
- clean_latents_1x, clean_latents_2x, clean_latents_4x = history_latents[:, :, :(1 + 2 + 16), :, :].split([1, 2, 16], dim=2)
579
- clean_latents = torch.cat([clean_latents_1x, start_latent], dim=2)
580
-
581
- generated_latents = sample_hunyuan(
582
- transformer=transformer,
583
- sampler='unipc',
584
- width=width,
585
- height=height,
586
- frames=latent_window_size * 4 - 3,
587
- real_guidance_scale=cfg,
588
- distilled_guidance_scale=gs,
589
- guidance_rescale=rs,
590
- # shift=3.0,
591
- num_inference_steps=steps,
592
- generator=rnd,
593
- prompt_embeds=llama_vec,
594
- prompt_embeds_mask=llama_attention_mask,
595
- prompt_poolers=clip_l_pooler,
596
- negative_prompt_embeds=llama_vec_n,
597
- negative_prompt_embeds_mask=llama_attention_mask_n,
598
- negative_prompt_poolers=clip_l_pooler_n,
599
- device=gpu,
600
- dtype=torch.bfloat16,
601
- image_embeddings=image_encoder_last_hidden_state,
602
- latent_indices=latent_indices,
603
- clean_latents=clean_latents,
604
- clean_latent_indices=clean_latent_indices,
605
- clean_latents_2x=clean_latents_2x,
606
- clean_latent_2x_indices=clean_latent_2x_indices,
607
- clean_latents_4x=clean_latents_4x,
608
- clean_latent_4x_indices=clean_latent_4x_indices,
609
- callback=callback,
610
- )
611
- del clean_latents
612
- del clean_latents_2x
613
- del clean_latents_4x
614
- del latent_indices
615
- del clean_latent_indices
616
- del clean_latent_2x_indices
617
- del clean_latent_4x_indices
618
-
619
- [total_generated_latent_frames, history_latents, history_pixels] = post_process(forward, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, section_index, total_latent_sections, outputs_folder, mp4_crf, stream)
620
-
621
- if not forward:
622
- if section_index > 0:
623
- section_index -= 1
624
  else:
625
- clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
626
- clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
627
-
628
- real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :]
629
- zero_latents = history_latents[:, :, total_generated_latent_frames:, :, :]
630
- history_latents = torch.cat([zero_latents, real_history_latents], dim=2)
631
- del real_history_latents
632
- del zero_latents
633
-
634
- forward = True
635
- section_index = first_section_index
636
 
637
- if forward:
638
- section_index += 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
639
  except:
640
  traceback.print_exc()
641
 
@@ -648,7 +653,7 @@ def worker(input_image, end_image, image_position, end_stillness, prompts, n_pro
648
  return
649
 
650
  @torch.no_grad()
651
- def worker_start_end(input_image, end_image, image_position, end_stillness, prompts, n_prompt, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
652
  def encode_prompt(prompt, n_prompt):
653
  llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
654
 
@@ -668,8 +673,7 @@ def worker_start_end(input_image, end_image, image_position, end_stillness, prom
668
 
669
  total_latent_sections = (total_second_length * fps_number) / (latent_window_size * 4)
670
  total_latent_sections = int(max(round(total_latent_sections), 1))
671
-
672
- job_id = generate_timestamp()
673
 
674
  stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
675
 
@@ -729,9 +733,11 @@ def worker_start_end(input_image, end_image, image_position, end_stillness, prom
729
  load_model_as_complete(vae, target_device=gpu)
730
 
731
  start_latent = vae_encode(input_image_pt, vae)
 
732
 
733
  if has_end_image:
734
  end_latent = vae_encode(end_image_pt, vae)
 
735
 
736
  # CLIP Vision
737
  stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
@@ -740,6 +746,7 @@ def worker_start_end(input_image, end_image, image_position, end_stillness, prom
740
  load_model_as_complete(image_encoder, target_device=gpu)
741
 
742
  image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
 
743
  image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
744
 
745
  if has_end_image:
@@ -763,163 +770,171 @@ def worker_start_end(input_image, end_image, image_position, end_stillness, prom
763
  # Dtype
764
  image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
765
 
766
- # Sampling
767
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))
768
-
769
- rnd = torch.Generator("cpu").manual_seed(seed)
770
- num_frames = latent_window_size * 4 - 3
771
 
772
- history_latents = torch.zeros(size=(1, 16, 1 + 2 + 16, height // 8, width // 8), dtype=torch.float32, device=cpu)
773
- start_latent = start_latent.to(history_latents)
774
- if has_end_image:
775
- end_latent = end_latent.to(history_latents)
776
 
777
- history_pixels = None
778
- total_generated_latent_frames = 0
779
-
780
- if total_latent_sections > 4:
781
- # In theory the latent_paddings should follow the else sequence, but it seems that duplicating some
782
- # items looks better than expanding it when total_latent_sections > 4
783
- # One can try to remove below trick and just
784
- # use `latent_paddings = list(reversed(range(total_latent_sections)))` to compare
785
- latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0]
786
- else:
787
- # Convert an iterator to a list
788
- latent_paddings = list(range(total_latent_sections - 1, -1, -1))
789
-
790
- if enable_preview:
791
- def callback(d):
792
- preview = d['denoised']
793
- preview = vae_decode_fake(preview)
794
-
795
- preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
796
- preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
797
-
798
- if stream.input_queue.top() == 'end':
799
- stream.output_queue.push(('end', None))
800
- raise KeyboardInterrupt('User ends the task.')
801
-
802
- current_step = d['i'] + 1
803
- percentage = int(100.0 * current_step / steps)
804
- hint = f'Sampling {current_step}/{steps}'
805
- desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / fps_number) :.2f} seconds (FPS-30), Resolution: {height}px * {width}px. The video is being extended now ...'
806
- stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
807
- return
808
- else:
809
- def callback(d):
810
- return
811
-
812
- def post_process(job_id, start_latent, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, outputs_folder, mp4_crf, stream, is_last_section):
813
- if is_last_section:
814
- generated_latents = torch.cat([start_latent.to(generated_latents), generated_latents], dim=2)
815
 
816
- total_generated_latent_frames += int(generated_latents.shape[2])
817
- history_latents = torch.cat([generated_latents.to(history_latents), history_latents], dim=2)
818
 
819
- if not high_vram:
820
- offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
821
- load_model_as_complete(vae, target_device=gpu)
 
 
 
 
822
 
823
- if history_pixels is None:
824
- history_pixels = vae_decode(history_latents[:, :, :total_generated_latent_frames, :, :], vae).cpu()
 
 
 
 
825
  else:
826
- section_latent_frames = (latent_window_size * 2 + 1) if is_last_section else (latent_window_size * 2)
827
- overlapped_frames = latent_window_size * 4 - 3
828
 
829
- current_pixels = vae_decode(history_latents[:, :, :min(total_generated_latent_frames, section_latent_frames)], vae).cpu()
830
- history_pixels = soft_append_bcthw(current_pixels, history_pixels, overlapped_frames)
831
-
832
- if not high_vram:
833
- unload_complete_models(vae)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
834
 
835
- if enable_preview or is_last_section:
836
- output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
 
837
 
838
- save_bcthw_as_mp4(history_pixels, output_filename, fps=fps_number, crf=mp4_crf)
 
839
 
840
- print(f'Decoded. Pixel shape {history_pixels.shape}')
 
 
841
 
842
- stream.output_queue.push(('file', output_filename))
 
 
 
 
843
 
844
- return [total_generated_latent_frames, history_latents, history_pixels]
845
-
846
- for latent_padding in latent_paddings:
847
- is_last_section = latent_padding == 0
848
- is_first_section = latent_padding == latent_paddings[0]
849
- latent_padding_size = latent_padding * latent_window_size
850
-
851
- if stream.input_queue.top() == 'end':
852
- stream.output_queue.push(('end', None))
853
- return
854
-
855
- print(f'latent_padding_size = {latent_padding_size}, is_last_section = {is_last_section}, is_first_section = {is_first_section}')
856
-
857
- if len(prompt_parameters) > 0:
858
- [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n] = prompt_parameters.pop(len(prompt_parameters) - 1)
859
-
860
- indices = torch.arange(1 + latent_padding_size + latent_window_size + 1 + (end_stillness if is_first_section else 0) + 2 + 16).unsqueeze(0)
861
- clean_latent_indices_pre, blank_indices, latent_indices, clean_latent_indices_post, clean_latent_2x_indices, clean_latent_4x_indices = indices.split([1, latent_padding_size, latent_window_size, 1 + (end_stillness if is_first_section else 0), 2, 16], dim=1)
862
- clean_latent_indices = torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1)
863
 
864
- clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, :1 + 2 + 16, :, :].split([1, 2, 16], dim=2)
865
-
866
- # Use end image latent for the first section if provided
867
- if has_end_image and is_first_section:
868
- clean_latents_post = end_latent.expand(-1, -1, 1 + end_stillness, -1, -1)
869
-
870
- clean_latents = torch.cat([start_latent, clean_latents_post], dim=2)
871
-
872
- if not high_vram:
873
- unload_complete_models()
874
- move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
875
 
876
- if use_teacache:
877
- transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
878
- else:
879
- transformer.initialize_teacache(enable_teacache=False)
880
-
881
- generated_latents = sample_hunyuan(
882
- transformer=transformer,
883
- sampler='unipc',
884
- width=width,
885
- height=height,
886
- frames=num_frames,
887
- real_guidance_scale=cfg,
888
- distilled_guidance_scale=gs,
889
- guidance_rescale=rs,
890
- # shift=3.0,
891
- num_inference_steps=steps,
892
- generator=rnd,
893
- prompt_embeds=llama_vec,
894
- prompt_embeds_mask=llama_attention_mask,
895
- prompt_poolers=clip_l_pooler,
896
- negative_prompt_embeds=llama_vec_n,
897
- negative_prompt_embeds_mask=llama_attention_mask_n,
898
- negative_prompt_poolers=clip_l_pooler_n,
899
- device=gpu,
900
- dtype=torch.bfloat16,
901
- image_embeddings=image_encoder_last_hidden_state,
902
- latent_indices=latent_indices,
903
- clean_latents=clean_latents,
904
- clean_latent_indices=clean_latent_indices,
905
- clean_latents_2x=clean_latents_2x,
906
- clean_latent_2x_indices=clean_latent_2x_indices,
907
- clean_latents_4x=clean_latents_4x,
908
- clean_latent_4x_indices=clean_latent_4x_indices,
909
- callback=callback,
910
- )
911
- del clean_latents
912
- del clean_latents_2x
913
- del clean_latents_4x
914
- del latent_indices
915
- del clean_latent_indices
916
- del clean_latent_2x_indices
917
- del clean_latent_4x_indices
918
-
919
- [total_generated_latent_frames, history_latents, history_pixels] = post_process(job_id, start_latent, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, outputs_folder, mp4_crf, stream, is_last_section)
920
-
921
- if is_last_section:
922
- break
923
  except:
924
  traceback.print_exc()
925
 
@@ -1116,7 +1131,6 @@ def worker_video(input_video, end_frame, end_stillness, prompts, n_prompt, seed,
1116
  if batch > 1:
1117
  print(f"Beginning video {idx+1} of {batch} with seed {seed} ")
1118
 
1119
- #job_id = generate_timestamp()
1120
  job_id = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")+f"_framepackf1-videoinput_{width}-{total_second_length}sec_seed-{seed}_steps-{steps}_distilled-{gs}_cfg-{cfg}" # 20250506 pftq: easier to read timestamp and filename
1121
 
1122
  # Sampling
@@ -1132,7 +1146,7 @@ def worker_video(input_video, end_frame, end_stillness, prompts, n_prompt, seed,
1132
 
1133
  # 20250509 Generate backwards with end frame for better end frame anchoring
1134
  if total_latent_sections > 4:
1135
- latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0]
1136
  else:
1137
  latent_paddings = list(reversed(range(total_latent_sections)))
1138
 
@@ -1253,30 +1267,33 @@ def worker_video(input_video, end_frame, end_stillness, prompts, n_prompt, seed,
1253
  stream.output_queue.push(('end', None))
1254
  return
1255
 
1256
- def get_duration(input_image, end_image, image_position, end_stillness, prompts, generation_mode, n_prompt, seed, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
1257
  return allocation_time
1258
 
1259
  @spaces.GPU(duration=get_duration)
1260
- def process_on_gpu(input_image, end_image, image_position, end_stillness, prompts, generation_mode, n_prompt, seed, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number
1261
  ):
1262
  start = time.time()
1263
  global stream
1264
  stream = AsyncStream()
1265
 
1266
- async_run(worker_start_end if generation_mode == "start_end" else worker, input_image, end_image, image_position, end_stillness, prompts, n_prompt, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number)
1267
 
1268
  output_filename = None
 
1269
 
1270
  while True:
1271
  flag, data = stream.output_queue.next()
1272
 
1273
  if flag == 'file':
1274
  output_filename = data
1275
- yield gr.update(value=output_filename, label="Previewed Frames"), gr.skip(), gr.skip(), gr.skip(), gr.update(interactive=False), gr.update(interactive=True), gr.skip()
 
 
1276
 
1277
  if flag == 'progress':
1278
  preview, desc, html = data
1279
- yield gr.update(label="Previewed Frames"), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True), gr.skip()
1280
 
1281
  if flag == 'end':
1282
  end = time.time()
@@ -1285,7 +1302,7 @@ def process_on_gpu(input_image, end_image, image_position, end_stillness, prompt
1285
  secondes = secondes - (minutes * 60)
1286
  hours = math.floor(minutes / 60)
1287
  minutes = minutes - (hours * 60)
1288
- yield gr.update(value=output_filename, label="Finished Frames"), gr.update(visible=False), gr.skip(), "The process has lasted " + \
1289
  ((str(hours) + " h, ") if hours != 0 else "") + \
1290
  ((str(minutes) + " min, ") if hours != 0 or minutes != 0 else "") + \
1291
  str(secondes) + " sec. " + \
@@ -1303,6 +1320,7 @@ def process(input_image,
1303
  seed=31337,
1304
  auto_allocation=True,
1305
  allocation_time=180,
 
1306
  resolution=640,
1307
  total_second_length=5,
1308
  latent_window_size=9,
@@ -1321,7 +1339,7 @@ def process(input_image,
1321
 
1322
  if torch.cuda.device_count() == 0:
1323
  gr.Warning('Set this space to GPU config to make it work.')
1324
- yield gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.update(visible = False)
1325
  return
1326
 
1327
  if randomize_seed:
@@ -1336,7 +1354,7 @@ def process(input_image,
1336
  assert input_image is not None, 'No input image!'
1337
  assert (generation_mode != "start_end") or end_image is not None, 'No end image!'
1338
 
1339
- yield gr.update(label="Previewed Frames"), None, '', '', gr.update(interactive=False), gr.update(interactive=True), gr.skip()
1340
 
1341
  gc.collect()
1342
  yield from process_on_gpu(input_image,
@@ -1347,6 +1365,7 @@ def process(input_image,
1347
  generation_mode,
1348
  n_prompt,
1349
  seed,
 
1350
  resolution,
1351
  total_second_length,
1352
  allocation_time,
@@ -1375,17 +1394,20 @@ def process_video_on_gpu(input_video, end_frame, end_stillness, prompts, n_promp
1375
  async_run(worker_video, input_video, end_frame, end_stillness, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch)
1376
 
1377
  output_filename = None
 
1378
 
1379
  while True:
1380
  flag, data = stream.output_queue.next()
1381
 
1382
  if flag == 'file':
1383
  output_filename = data
1384
- yield gr.update(value=output_filename, label="Previewed Frames"), gr.skip(), gr.skip(), gr.skip(), gr.update(interactive=False), gr.update(interactive=True), gr.skip()
 
 
1385
 
1386
  if flag == 'progress':
1387
  preview, desc, html = data
1388
- yield gr.update(label="Previewed Frames"), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True), gr.skip() # 20250506 pftq: Keep refreshing the video in case it got hidden when the tab was in the background
1389
 
1390
  if flag == 'end':
1391
  end = time.time()
@@ -1394,7 +1416,7 @@ def process_video_on_gpu(input_video, end_frame, end_stillness, prompts, n_promp
1394
  secondes = secondes - (minutes * 60)
1395
  hours = math.floor(minutes / 60)
1396
  minutes = minutes - (hours * 60)
1397
- yield gr.update(value=output_filename, label="Finished Frames"), gr.update(visible=False), desc + \
1398
  " The process has lasted " + \
1399
  ((str(hours) + " h, ") if hours != 0 else "") + \
1400
  ((str(minutes) + " min, ") if hours != 0 or minutes != 0 else "") + \
@@ -1409,7 +1431,7 @@ def process_video(input_video, end_frame, end_stillness, prompt, n_prompt, rando
1409
 
1410
  if torch.cuda.device_count() == 0:
1411
  gr.Warning('Set this space to GPU config to make it work.')
1412
- yield gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.update(visible = False)
1413
  return
1414
 
1415
  if randomize_seed:
@@ -1420,7 +1442,7 @@ def process_video(input_video, end_frame, end_stillness, prompt, n_prompt, rando
1420
  # 20250506 pftq: Updated assertion for video input
1421
  assert input_video is not None, 'No input video!'
1422
 
1423
- yield gr.update(label="Previewed Frames"), None, '', '', gr.update(interactive=False), gr.update(interactive=True), gr.skip()
1424
 
1425
  # 20250507 pftq: Even the H100 needs offloading if the video dimensions are 720p or higher
1426
  if high_vram and (no_resize or resolution>640):
@@ -1535,7 +1557,7 @@ with block:
1535
  enable_preview = gr.Checkbox(label='Enable preview', value=True, info='Display a preview around each second generated but it costs 2 sec. for each second generated.')
1536
  use_teacache = gr.Checkbox(label='Use TeaCache', value=False, info='Faster speed and no break in brightness, but often makes hands and fingers slightly worse.')
1537
 
1538
- n_prompt = gr.Textbox(label="Negative Prompt", value="Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", info='Requires using normal CFG (undistilled) instead of Distilled (set Distilled=1 and CFG > 1).')
1539
 
1540
  fps_number = gr.Slider(label="Frame per seconds", info="The model is trained for 30 fps so other fps may generate weird results", minimum=10, maximum=60, value=30, step=1)
1541
  end_stillness = gr.Slider(label="End stillness", minimum=0, maximum=100, value=0, step=1, info='0=Realistic end; >0=Matches exactly the end image (but the time seems to freeze)')
@@ -1548,7 +1570,7 @@ with block:
1548
  resolution = gr.Dropdown([
1549
  ["409,600 px (working)", 640],
1550
  ["451,584 px (working)", 672],
1551
- ["495,616 px (VRAM pb on HF)", 704],
1552
  ["589,824 px (not tested)", 768],
1553
  ["692,224 px (not tested)", 832],
1554
  ["746,496 px (not tested)", 864],
@@ -1576,7 +1598,7 @@ with block:
1576
  gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. Larger value causes slower speed.")
1577
 
1578
  mp4_crf = gr.Slider(label="MP4 Compression", minimum=0, maximum=100, value=16, step=1, info="Lower means better quality. 0 is uncompressed. Change to 16 if you get black outputs. ")
1579
- batch = gr.Slider(label="Batch Size (Number of Videos)", minimum=1, maximum=1000, value=1, step=1, info='Generate multiple videos each with a different seed.')
1580
  with gr.Row():
1581
  randomize_seed = gr.Checkbox(label='Randomize seed', value=True, info='If checked, the seed is always different')
1582
  seed = gr.Slider(label="Seed", minimum=0, maximum=np.iinfo(np.int32).max, step=1, randomize=True)
@@ -1586,12 +1608,21 @@ with block:
1586
 
1587
  with gr.Column():
1588
  warning = gr.HTML(elem_id="warning", value = "<center><big>Your computer must <u>not</u> enter into standby mode.</big><br/>On Chrome, you can force to keep a tab alive in <code>chrome://discards/</code></center>", visible = False)
1589
- result_video = gr.Video(label="Generated Frames", autoplay=True, show_share_button=False, height=512, loop=True)
1590
- preview_image = gr.Image(label="Next Latents", height=200, visible=False)
 
 
 
 
 
 
 
 
 
1591
  progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
1592
  progress_bar = gr.HTML('', elem_classes='no-generating-animation')
1593
 
1594
- ips = [input_image, end_image, image_position, end_stillness, final_prompt, generation_mode, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number]
1595
  ips_video = [input_video, end_image, end_stillness, final_prompt, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch]
1596
 
1597
  gr.Examples(
@@ -1604,11 +1635,12 @@ with block:
1604
  1, # end_stillness
1605
  "Overcrowed street in Japan, photorealistic, realistic, intricate details, 8k, insanely detailed",
1606
  "text", # generation_mode
1607
- "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
1608
  True, # randomize_seed
1609
  42, # seed
1610
  True, # auto_allocation
1611
  180, # allocation_time
 
1612
  672, # resolution
1613
  1, # total_second_length
1614
  9, # latent_window_size
@@ -1626,7 +1658,7 @@ with block:
1626
  run_on_click = True,
1627
  fn = process,
1628
  inputs = ips,
1629
- outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
1630
  cache_examples = False,
1631
  )
1632
 
@@ -1640,11 +1672,12 @@ with block:
1640
  1, # end_stillness
1641
  "A dolphin emerges from the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
1642
  "image", # generation_mode
1643
- "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
1644
  True, # randomize_seed
1645
  42, # seed
1646
  True, # auto_allocation
1647
  180, # allocation_time
 
1648
  672, # resolution
1649
  1, # total_second_length
1650
  9, # latent_window_size
@@ -1665,11 +1698,12 @@ with block:
1665
  1, # end_stillness
1666
  "A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks, the man stops talking and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens",
1667
  "image", # generation_mode
1668
- "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
1669
  True, # randomize_seed
1670
  42, # seed
1671
  True, # auto_allocation
1672
  180, # allocation_time
 
1673
  672, # resolution
1674
  2, # total_second_length
1675
  9, # latent_window_size
@@ -1690,11 +1724,12 @@ with block:
1690
  1, # end_stillness
1691
  "A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks, the woman stops talking and the woman listens A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens",
1692
  "image", # generation_mode
1693
- "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
1694
  True, # randomize_seed
1695
  42, # seed
1696
  True, # auto_allocation
1697
  180, # allocation_time
 
1698
  672, # resolution
1699
  2, # total_second_length
1700
  9, # latent_window_size
@@ -1720,6 +1755,7 @@ with block:
1720
  42, # seed
1721
  True, # auto_allocation
1722
  180, # allocation_time
 
1723
  672, # resolution
1724
  1, # total_second_length
1725
  9, # latent_window_size
@@ -1740,11 +1776,12 @@ with block:
1740
  1, # end_stillness
1741
  "A building starting to explode, photorealistic, realisitc, 8k, insanely detailed",
1742
  "image", # generation_mode
1743
- "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
1744
  True, # randomize_seed
1745
  42, # seed
1746
  True, # auto_allocation
1747
  180, # allocation_time
 
1748
  672, # resolution
1749
  1, # total_second_length
1750
  9, # latent_window_size
@@ -1762,7 +1799,7 @@ with block:
1762
  run_on_click = True,
1763
  fn = process,
1764
  inputs = ips,
1765
- outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
1766
  cache_examples = False,
1767
  )
1768
 
@@ -1776,11 +1813,12 @@ with block:
1776
  0, # end_stillness
1777
  "A woman jumps out of the train and arrives on the ground, viewed from the outside, photorealistic, realistic, amateur photography, midday, insanely detailed, 8k", # prompt
1778
  "start_end", # generation_mode
1779
- "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, jumpcut, crossfader, crossfading", # n_prompt
1780
  True, # randomize_seed
1781
  42, # seed
1782
  True, # auto_allocation
1783
  180, # allocation_time
 
1784
  672, # resolution
1785
  1, # total_second_length
1786
  9, # latent_window_size
@@ -1798,7 +1836,7 @@ with block:
1798
  run_on_click = True,
1799
  fn = process,
1800
  inputs = ips,
1801
- outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
1802
  cache_examples = False,
1803
  )
1804
 
@@ -1810,7 +1848,7 @@ with block:
1810
  None, # end_image
1811
  1, # end_stillness
1812
  "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
1813
- "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, jumpcut, crossfader, crossfading", # n_prompt
1814
  True, # randomize_seed
1815
  42, # seed
1816
  True, # auto_allocation
@@ -1836,7 +1874,7 @@ with block:
1836
  "./img_examples/Example1.png", # end_image
1837
  1, # end_stillness
1838
  "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
1839
- "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, jumpcut, crossfader, crossfading", # n_prompt
1840
  True, # randomize_seed
1841
  42, # seed
1842
  True, # auto_allocation
@@ -1861,7 +1899,7 @@ with block:
1861
  run_on_click = True,
1862
  fn = process_video,
1863
  inputs = ips_video,
1864
- outputs = [result_video, preview_image, progress_desc, progress_bar, start_button_video, end_button, warning],
1865
  cache_examples = False,
1866
  )
1867
 
@@ -1899,7 +1937,6 @@ with block:
1899
  gr.update(visible = True), # start_button
1900
  gr.update(visible = False), # start_button_video
1901
  gr.update(visible = False), # no_resize
1902
- gr.update(visible = False), # batch
1903
  gr.update(visible = False), # num_clean_frames
1904
  gr.update(visible = False), # vae_batch
1905
  gr.update(visible = False), # prompt_hint
@@ -1916,7 +1953,6 @@ with block:
1916
  gr.update(visible = True), # start_button
1917
  gr.update(visible = False), # start_button_video
1918
  gr.update(visible = False), # no_resize
1919
- gr.update(visible = False), # batch
1920
  gr.update(visible = False), # num_clean_frames
1921
  gr.update(visible = False), # vae_batch
1922
  gr.update(visible = False), # prompt_hint
@@ -1933,7 +1969,6 @@ with block:
1933
  gr.update(visible = True), # start_button
1934
  gr.update(visible = False), # start_button_video
1935
  gr.update(visible = False), # no_resize
1936
- gr.update(visible = False), # batch
1937
  gr.update(visible = False), # num_clean_frames
1938
  gr.update(visible = False), # vae_batch
1939
  gr.update(visible = False), # prompt_hint
@@ -1950,7 +1985,6 @@ with block:
1950
  gr.update(visible = False), # start_button
1951
  gr.update(visible = True), # start_button_video
1952
  gr.update(visible = True), # no_resize
1953
- gr.update(visible = True), # batch
1954
  gr.update(visible = True), # num_clean_frames
1955
  gr.update(visible = True), # vae_batch
1956
  gr.update(visible = True), # prompt_hint
@@ -1961,10 +1995,10 @@ with block:
1961
  timeless_prompt.change(fn=handle_timeless_prompt_change, inputs=[timeless_prompt], outputs=[final_prompt])
1962
  start_button.click(fn = check_parameters, inputs = [
1963
  generation_mode, input_image, input_video
1964
- ], outputs = [end_button, warning], queue = False, show_progress = False).success(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning], scroll_to_output = True)
1965
  start_button_video.click(fn = check_parameters, inputs = [
1966
  generation_mode, input_image, input_video
1967
- ], outputs = [end_button, warning], queue = False, show_progress = False).success(fn=process_video, inputs=ips_video, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button_video, end_button, warning], scroll_to_output = True)
1968
  end_button.click(fn=end_process)
1969
 
1970
  generation_mode.change(fn = save_preferences, inputs = [
@@ -1977,7 +2011,7 @@ with block:
1977
  generation_mode.change(
1978
  fn=handle_generation_mode_change,
1979
  inputs=[generation_mode],
1980
- outputs=[text_to_video_hint, image_position, input_image, end_image, end_stillness, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch, prompt_hint, fps_number]
1981
  )
1982
 
1983
  # Update display when the page loads
@@ -1985,7 +2019,7 @@ with block:
1985
  fn=handle_generation_mode_change, inputs = [
1986
  generation_mode
1987
  ], outputs = [
1988
- text_to_video_hint, image_position, input_image, end_image, end_stillness, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch, prompt_hint, fps_number
1989
  ]
1990
  )
1991
 
 
41
  from diffusers import AutoencoderKLHunyuanVideo
42
  from transformers import LlamaModel, CLIPTextModel, LlamaTokenizerFast, CLIPTokenizer
43
  from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode, vae_decode_fake
44
+ from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, state_dict_weighted_merge, state_dict_offset_merge
45
  from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
46
  from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
47
  if torch.cuda.device_count() > 0:
 
368
  raise
369
 
370
  @torch.no_grad()
371
+ def worker(input_image, end_image, image_position, end_stillness, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
372
  def encode_prompt(prompt, n_prompt):
373
  llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
374
 
 
393
  section_index = first_section_index
394
  forward = (image_position == 0)
395
 
 
 
396
  stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
397
 
398
  try:
 
468
 
469
  image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
470
 
471
+ for idx in range(batch):
472
+ if batch > 1:
473
+ print(f"Beginning video {idx+1} of {batch} with seed {seed} ")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
474
 
475
+ job_id = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")+f"_framepackf1_{width}-{total_second_length}sec_seed-{seed}_steps-{steps}_distilled-{gs}_cfg-{cfg}"
476
 
477
+ # Sampling
478
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))
479
+
480
+ rnd = torch.Generator("cpu").manual_seed(seed)
481
+
482
+ history_latents = torch.zeros(size=(1, 16, 16 + 2 + 1, height // 8, width // 8), dtype=torch.float32, device=cpu)
483
+ start_latent = start_latent.to(history_latents)
484
+ history_pixels = None
485
+
486
+ history_latents = torch.cat([history_latents, start_latent] if forward else [start_latent, history_latents], dim=2)
487
+ total_generated_latent_frames = 1
488
+
489
+ if enable_preview:
490
+ def callback(d):
491
+ preview = d['denoised']
492
+ preview = vae_decode_fake(preview)
493
+
494
+ preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
495
+ preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
496
+
497
+ if stream.input_queue.top() == 'end':
498
+ stream.output_queue.push(('end', None))
499
+ raise KeyboardInterrupt('User ends the task.')
500
+
501
+ current_step = d['i'] + 1
502
+ percentage = int(100.0 * current_step / steps)
503
+ hint = f'Sampling {current_step}/{steps}'
504
+ desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / fps_number) :.2f} seconds (FPS-30), Resolution: {height}px * {width}px, Video {idx+1} of {batch}. The video is being extended now ...'
505
+ stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
506
+ return
507
  else:
508
+ def callback(d):
509
+ return
510
+
511
+ indices = torch.arange(0, 1 + 16 + 2 + 1 + latent_window_size).unsqueeze(0)
512
  if forward:
513
+ clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
514
+ clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
515
  else:
516
+ latent_indices, clean_latent_1x_indices, clean_latent_2x_indices, clean_latent_4x_indices, clean_latent_indices_start = indices.split([latent_window_size, 1, 2, 16, 1], dim=1)
517
+ clean_latent_indices = torch.cat([clean_latent_1x_indices, clean_latent_indices_start], dim=1)
518
+
519
+ def post_process(forward, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, section_index, total_latent_sections, outputs_folder, mp4_crf, stream):
520
+ total_generated_latent_frames += int(generated_latents.shape[2])
521
+ history_latents = torch.cat([history_latents, generated_latents.to(history_latents)] if forward else [generated_latents.to(history_latents), history_latents], dim=2)
522
+
523
+ if not high_vram:
524
+ offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
525
+ load_model_as_complete(vae, target_device=gpu)
526
+
527
+ if history_pixels is None:
528
+ real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :] if forward else history_latents[:, :, :total_generated_latent_frames, :, :]
529
+ history_pixels = vae_decode(real_history_latents, vae).cpu()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
530
  else:
531
+ section_latent_frames = latent_window_size * 2
532
+ overlapped_frames = latent_window_size * 4 - 3
 
 
 
 
 
 
 
 
 
533
 
534
+ if forward:
535
+ real_history_latents = history_latents[:, :, -min(section_latent_frames, total_generated_latent_frames):, :, :]
536
+ history_pixels = soft_append_bcthw(history_pixels, vae_decode(real_history_latents, vae).cpu(), overlapped_frames)
537
+ else:
538
+ real_history_latents = history_latents[:, :, :min(section_latent_frames, total_generated_latent_frames), :, :]
539
+ history_pixels = soft_append_bcthw(vae_decode(real_history_latents, vae).cpu(), history_pixels, overlapped_frames)
540
+
541
+ if not high_vram:
542
+ unload_complete_models(text_encoder, text_encoder_2, image_encoder, vae, transformer)
543
+
544
+ if enable_preview or section_index == (0 if first_section_index == (total_latent_sections - 1) else (total_latent_sections - 1)):
545
+ output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
546
+
547
+ save_bcthw_as_mp4(history_pixels, output_filename, fps=fps_number, crf=mp4_crf)
548
+
549
+ print(f'Decoded. Current latent shape pixel shape {history_pixels.shape}')
550
+
551
+ stream.output_queue.push(('file', output_filename))
552
+ return [total_generated_latent_frames, history_latents, history_pixels]
553
+
554
+ while section_index < total_latent_sections:
555
+ if stream.input_queue.top() == 'end':
556
+ stream.output_queue.push(('end', None))
557
+ return
558
+
559
+ print(f'section_index = {section_index}, total_latent_sections = {total_latent_sections}')
560
+
561
+ prompt_index = min(section_index, len(prompt_parameters) - 1)
562
+
563
+ [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n] = prompt_parameters[prompt_index]
564
+
565
+ if prompt_index < len(prompt_parameters) - 1 or (prompt_index == total_latent_sections - 1):
566
+ del prompt_parameters[prompt_index]
567
+
568
+ if not high_vram:
569
+ unload_complete_models()
570
+ move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
571
+
572
+ if use_teacache:
573
+ transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
574
+ else:
575
+ transformer.initialize_teacache(enable_teacache=False)
576
+
577
+ if forward:
578
+ clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -(16 + 2 + 1):, :, :].split([16, 2, 1], dim=2)
579
+ clean_latents = torch.cat([start_latent, clean_latents_1x], dim=2)
580
+ else:
581
+ clean_latents_1x, clean_latents_2x, clean_latents_4x = history_latents[:, :, :(1 + 2 + 16), :, :].split([1, 2, 16], dim=2)
582
+ clean_latents = torch.cat([clean_latents_1x, start_latent], dim=2)
583
+
584
+ generated_latents = sample_hunyuan(
585
+ transformer=transformer,
586
+ sampler='unipc',
587
+ width=width,
588
+ height=height,
589
+ frames=latent_window_size * 4 - 3,
590
+ real_guidance_scale=cfg,
591
+ distilled_guidance_scale=gs,
592
+ guidance_rescale=rs,
593
+ # shift=3.0,
594
+ num_inference_steps=steps,
595
+ generator=rnd,
596
+ prompt_embeds=llama_vec,
597
+ prompt_embeds_mask=llama_attention_mask,
598
+ prompt_poolers=clip_l_pooler,
599
+ negative_prompt_embeds=llama_vec_n,
600
+ negative_prompt_embeds_mask=llama_attention_mask_n,
601
+ negative_prompt_poolers=clip_l_pooler_n,
602
+ device=gpu,
603
+ dtype=torch.bfloat16,
604
+ image_embeddings=image_encoder_last_hidden_state,
605
+ latent_indices=latent_indices,
606
+ clean_latents=clean_latents,
607
+ clean_latent_indices=clean_latent_indices,
608
+ clean_latents_2x=clean_latents_2x,
609
+ clean_latent_2x_indices=clean_latent_2x_indices,
610
+ clean_latents_4x=clean_latents_4x,
611
+ clean_latent_4x_indices=clean_latent_4x_indices,
612
+ callback=callback,
613
+ )
614
+ del clean_latents
615
+ del clean_latents_2x
616
+ del clean_latents_4x
617
+ del latent_indices
618
+ del clean_latent_indices
619
+ del clean_latent_2x_indices
620
+ del clean_latent_4x_indices
621
+
622
+ [total_generated_latent_frames, history_latents, history_pixels] = post_process(forward, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, section_index, total_latent_sections, outputs_folder, mp4_crf, stream)
623
+
624
+ if not forward:
625
+ if section_index > 0:
626
+ section_index -= 1
627
+ else:
628
+ clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
629
+ clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
630
+
631
+ real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :]
632
+ zero_latents = history_latents[:, :, total_generated_latent_frames:, :, :]
633
+ history_latents = torch.cat([zero_latents, real_history_latents], dim=2)
634
+ del real_history_latents
635
+ del zero_latents
636
+
637
+ forward = True
638
+ section_index = first_section_index
639
+
640
+ if forward:
641
+ section_index += 1
642
+
643
+ seed = (seed + 1) % np.iinfo(np.int32).max
644
  except:
645
  traceback.print_exc()
646
 
 
653
  return
654
 
655
  @torch.no_grad()
656
+ def worker_start_end(input_image, end_image, image_position, end_stillness, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
657
  def encode_prompt(prompt, n_prompt):
658
  llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
659
 
 
673
 
674
  total_latent_sections = (total_second_length * fps_number) / (latent_window_size * 4)
675
  total_latent_sections = int(max(round(total_latent_sections), 1))
676
+
 
677
 
678
  stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
679
 
 
733
  load_model_as_complete(vae, target_device=gpu)
734
 
735
  start_latent = vae_encode(input_image_pt, vae)
736
+ del input_image_pt
737
 
738
  if has_end_image:
739
  end_latent = vae_encode(end_image_pt, vae)
740
+ del end_image_pt
741
 
742
  # CLIP Vision
743
  stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
 
746
  load_model_as_complete(image_encoder, target_device=gpu)
747
 
748
  image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
749
+ del input_image_np
750
  image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
751
 
752
  if has_end_image:
 
770
  # Dtype
771
  image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
772
 
773
+ for idx in range(batch):
774
+ if batch > 1:
775
+ print(f"Beginning video {idx+1} of {batch} with seed {seed} ")
 
 
776
 
777
+ job_id = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")+f"_framepackse_{width}-{total_second_length}sec_seed-{seed}_steps-{steps}_distilled-{gs}_cfg-{cfg}"
 
 
 
778
 
779
+ # Sampling
780
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
781
 
782
+ rnd = torch.Generator("cpu").manual_seed(seed)
783
+ num_frames = latent_window_size * 4 - 3
784
 
785
+ history_latents = torch.zeros(size=(1, 16, 1 + 2 + 16, height // 8, width // 8), dtype=torch.float32, device=cpu)
786
+ start_latent = start_latent.to(history_latents)
787
+ if has_end_image:
788
+ end_latent = end_latent.to(history_latents)
789
+
790
+ history_pixels = None
791
+ total_generated_latent_frames = 0
792
 
793
+ if total_latent_sections > 4:
794
+ # In theory the latent_paddings should follow the else sequence, but it seems that duplicating some
795
+ # items looks better than expanding it when total_latent_sections > 4
796
+ # One can try to remove below trick and just
797
+ # use `latent_paddings = list(reversed(range(total_latent_sections)))` to compare
798
+ latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0]
799
  else:
800
+ # Convert an iterator to a list
801
+ latent_paddings = list(range(total_latent_sections - 1, -1, -1))
802
 
803
+ if enable_preview:
804
+ def callback(d):
805
+ preview = d['denoised']
806
+ preview = vae_decode_fake(preview)
807
+
808
+ preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
809
+ preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
810
+
811
+ if stream.input_queue.top() == 'end':
812
+ stream.output_queue.push(('end', None))
813
+ raise KeyboardInterrupt('User ends the task.')
814
+
815
+ current_step = d['i'] + 1
816
+ percentage = int(100.0 * current_step / steps)
817
+ hint = f'Sampling {current_step}/{steps}'
818
+ desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / fps_number) :.2f} seconds (FPS-30), Resolution: {height}px * {width}px, Video {idx+1} of {batch}. The video is being extended now ...'
819
+ stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
820
+ return
821
+ else:
822
+ def callback(d):
823
+ return
824
 
825
+ def post_process(job_id, start_latent, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, outputs_folder, mp4_crf, stream, is_last_section):
826
+ if is_last_section:
827
+ generated_latents = torch.cat([start_latent.to(generated_latents), generated_latents], dim=2)
828
 
829
+ total_generated_latent_frames += int(generated_latents.shape[2])
830
+ history_latents = torch.cat([generated_latents.to(history_latents), history_latents], dim=2)
831
 
832
+ if not high_vram:
833
+ offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
834
+ load_model_as_complete(vae, target_device=gpu)
835
 
836
+ if history_pixels is None:
837
+ history_pixels = vae_decode(history_latents[:, :, :total_generated_latent_frames, :, :], vae).cpu()
838
+ else:
839
+ section_latent_frames = (latent_window_size * 2 + 1) if is_last_section else (latent_window_size * 2)
840
+ overlapped_frames = latent_window_size * 4 - 3
841
 
842
+ current_pixels = vae_decode(history_latents[:, :, :min(total_generated_latent_frames, section_latent_frames)], vae).cpu()
843
+ history_pixels = soft_append_bcthw(current_pixels, history_pixels, overlapped_frames)
844
+
845
+ if not high_vram:
846
+ unload_complete_models(vae)
847
+
848
+ if enable_preview or is_last_section:
849
+ output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
850
+
851
+ save_bcthw_as_mp4(history_pixels, output_filename, fps=fps_number, crf=mp4_crf)
852
+
853
+ print(f'Decoded. Pixel shape {history_pixels.shape}')
854
+
855
+ stream.output_queue.push(('file', output_filename))
856
+
857
+ return [total_generated_latent_frames, history_latents, history_pixels]
 
 
 
858
 
859
+ for latent_padding in latent_paddings:
860
+ is_last_section = latent_padding == 0
861
+ is_first_section = latent_padding == latent_paddings[0]
862
+ latent_padding_size = latent_padding * latent_window_size
863
+
864
+ if stream.input_queue.top() == 'end':
865
+ stream.output_queue.push(('end', None))
866
+ return
867
+
868
+ print(f'latent_padding_size = {latent_padding_size}, is_last_section = {is_last_section}, is_first_section = {is_first_section}')
869
+
870
+ if len(prompt_parameters) > 0:
871
+ [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n] = prompt_parameters.pop(len(prompt_parameters) - 1)
872
+
873
+ indices = torch.arange(1 + latent_padding_size + latent_window_size + 1 + (end_stillness if is_first_section else 0) + 2 + 16).unsqueeze(0)
874
+ clean_latent_indices_pre, blank_indices, latent_indices, clean_latent_indices_post, clean_latent_2x_indices, clean_latent_4x_indices = indices.split([1, latent_padding_size, latent_window_size, 1 + (end_stillness if is_first_section else 0), 2, 16], dim=1)
875
+ clean_latent_indices = torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1)
876
+
877
+ clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, :1 + 2 + 16, :, :].split([1, 2, 16], dim=2)
878
+
879
+ # Use end image latent for the first section if provided
880
+ if has_end_image and is_first_section:
881
+ clean_latents_post = end_latent.expand(-1, -1, 1 + end_stillness, -1, -1)
882
+
883
+ clean_latents = torch.cat([start_latent, clean_latents_post], dim=2)
884
+
885
+ if not high_vram:
886
+ unload_complete_models()
887
+ move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
888
+
889
+ if use_teacache:
890
+ transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
891
+ else:
892
+ transformer.initialize_teacache(enable_teacache=False)
893
+
894
+ generated_latents = sample_hunyuan(
895
+ transformer=transformer,
896
+ sampler='unipc',
897
+ width=width,
898
+ height=height,
899
+ frames=num_frames,
900
+ real_guidance_scale=cfg,
901
+ distilled_guidance_scale=gs,
902
+ guidance_rescale=rs,
903
+ # shift=3.0,
904
+ num_inference_steps=steps,
905
+ generator=rnd,
906
+ prompt_embeds=llama_vec,
907
+ prompt_embeds_mask=llama_attention_mask,
908
+ prompt_poolers=clip_l_pooler,
909
+ negative_prompt_embeds=llama_vec_n,
910
+ negative_prompt_embeds_mask=llama_attention_mask_n,
911
+ negative_prompt_poolers=clip_l_pooler_n,
912
+ device=gpu,
913
+ dtype=torch.bfloat16,
914
+ image_embeddings=image_encoder_last_hidden_state,
915
+ latent_indices=latent_indices,
916
+ clean_latents=clean_latents,
917
+ clean_latent_indices=clean_latent_indices,
918
+ clean_latents_2x=clean_latents_2x,
919
+ clean_latent_2x_indices=clean_latent_2x_indices,
920
+ clean_latents_4x=clean_latents_4x,
921
+ clean_latent_4x_indices=clean_latent_4x_indices,
922
+ callback=callback,
923
+ )
924
+ del clean_latents
925
+ del clean_latents_2x
926
+ del clean_latents_4x
927
+ del latent_indices
928
+ del clean_latent_indices
929
+ del clean_latent_2x_indices
930
+ del clean_latent_4x_indices
931
+
932
+ [total_generated_latent_frames, history_latents, history_pixels] = post_process(job_id, start_latent, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, outputs_folder, mp4_crf, stream, is_last_section)
933
+
934
+ if is_last_section:
935
+ break
936
 
937
+ seed = (seed + 1) % np.iinfo(np.int32).max
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
938
  except:
939
  traceback.print_exc()
940
 
 
1131
  if batch > 1:
1132
  print(f"Beginning video {idx+1} of {batch} with seed {seed} ")
1133
 
 
1134
  job_id = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")+f"_framepackf1-videoinput_{width}-{total_second_length}sec_seed-{seed}_steps-{steps}_distilled-{gs}_cfg-{cfg}" # 20250506 pftq: easier to read timestamp and filename
1135
 
1136
  # Sampling
 
1146
 
1147
  # 20250509 Generate backwards with end frame for better end frame anchoring
1148
  if total_latent_sections > 4:
1149
+ latent_paddings = [3, 2] + [1] * (total_latent_sections - 3) + [0]
1150
  else:
1151
  latent_paddings = list(reversed(range(total_latent_sections)))
1152
 
 
1267
  stream.output_queue.push(('end', None))
1268
  return
1269
 
1270
+ def get_duration(input_image, end_image, image_position, end_stillness, prompts, generation_mode, n_prompt, seed, batch, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
1271
  return allocation_time
1272
 
1273
  @spaces.GPU(duration=get_duration)
1274
+ def process_on_gpu(input_image, end_image, image_position, end_stillness, prompts, generation_mode, n_prompt, seed, batch, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number
1275
  ):
1276
  start = time.time()
1277
  global stream
1278
  stream = AsyncStream()
1279
 
1280
+ async_run(worker_start_end if generation_mode == "start_end" else worker, input_image, end_image, image_position, end_stillness, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number)
1281
 
1282
  output_filename = None
1283
+ output_filenames = ""
1284
 
1285
  while True:
1286
  flag, data = stream.output_queue.next()
1287
 
1288
  if flag == 'file':
1289
  output_filename = data
1290
+ output_filenames = output_filenames + ";" + str(output_filename)
1291
+ print("output_filename=" + str(output_filename))
1292
+ yield gr.update(value=output_filename, label="Previewed Frames"), gr.update(value=output_filenames, visible=True), gr.skip(), gr.skip(), gr.skip(), gr.update(interactive=False), gr.update(interactive=True), gr.skip()
1293
 
1294
  if flag == 'progress':
1295
  preview, desc, html = data
1296
+ yield gr.update(label="Previewed Frames"), gr.skip(), gr.skip(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True), gr.skip()
1297
 
1298
  if flag == 'end':
1299
  end = time.time()
 
1302
  secondes = secondes - (minutes * 60)
1303
  hours = math.floor(minutes / 60)
1304
  minutes = minutes - (hours * 60)
1305
+ yield gr.update(value=output_filename, label="Finished Frames"), gr.update(value=output_filenames, visible=True), gr.update(visible=False), gr.skip(), "The process has lasted " + \
1306
  ((str(hours) + " h, ") if hours != 0 else "") + \
1307
  ((str(minutes) + " min, ") if hours != 0 or minutes != 0 else "") + \
1308
  str(secondes) + " sec. " + \
 
1320
  seed=31337,
1321
  auto_allocation=True,
1322
  allocation_time=180,
1323
+ batch=1,
1324
  resolution=640,
1325
  total_second_length=5,
1326
  latent_window_size=9,
 
1339
 
1340
  if torch.cuda.device_count() == 0:
1341
  gr.Warning('Set this space to GPU config to make it work.')
1342
+ yield gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.update(visible = False)
1343
  return
1344
 
1345
  if randomize_seed:
 
1354
  assert input_image is not None, 'No input image!'
1355
  assert (generation_mode != "start_end") or end_image is not None, 'No end image!'
1356
 
1357
+ yield gr.update(label="Previewed Frames"), gr.update(value = ""), None, '', '', gr.update(interactive=False), gr.update(interactive=True), gr.skip()
1358
 
1359
  gc.collect()
1360
  yield from process_on_gpu(input_image,
 
1365
  generation_mode,
1366
  n_prompt,
1367
  seed,
1368
+ batch,
1369
  resolution,
1370
  total_second_length,
1371
  allocation_time,
 
1394
  async_run(worker_video, input_video, end_frame, end_stillness, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch)
1395
 
1396
  output_filename = None
1397
+ output_filenames = ""
1398
 
1399
  while True:
1400
  flag, data = stream.output_queue.next()
1401
 
1402
  if flag == 'file':
1403
  output_filename = data
1404
+ print("output_filename=" + str(output_filename))
1405
+ output_filenames = output_filenames + ";" + str(output_filename)
1406
+ yield gr.update(value=output_filename, label="Previewed Frames"), gr.update(value=output_filenames, visible=True), gr.skip(), gr.skip(), gr.skip(), gr.update(interactive=False), gr.update(interactive=True), gr.skip()
1407
 
1408
  if flag == 'progress':
1409
  preview, desc, html = data
1410
+ yield gr.update(label="Previewed Frames"), gr.skip(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True), gr.skip() # 20250506 pftq: Keep refreshing the video in case it got hidden when the tab was in the background
1411
 
1412
  if flag == 'end':
1413
  end = time.time()
 
1416
  secondes = secondes - (minutes * 60)
1417
  hours = math.floor(minutes / 60)
1418
  minutes = minutes - (hours * 60)
1419
+ yield gr.update(value=output_filename, label="Finished Frames"), gr.update(value=output_filenames, visible=True), gr.update(visible=False), desc + \
1420
  " The process has lasted " + \
1421
  ((str(hours) + " h, ") if hours != 0 else "") + \
1422
  ((str(minutes) + " min, ") if hours != 0 or minutes != 0 else "") + \
 
1431
 
1432
  if torch.cuda.device_count() == 0:
1433
  gr.Warning('Set this space to GPU config to make it work.')
1434
+ yield gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.update(visible = False)
1435
  return
1436
 
1437
  if randomize_seed:
 
1442
  # 20250506 pftq: Updated assertion for video input
1443
  assert input_video is not None, 'No input video!'
1444
 
1445
+ yield gr.update(label="Previewed Frames"), gr.update(value = ""), None, '', '', gr.update(interactive=False), gr.update(interactive=True), gr.skip()
1446
 
1447
  # 20250507 pftq: Even the H100 needs offloading if the video dimensions are 720p or higher
1448
  if high_vram and (no_resize or resolution>640):
 
1557
  enable_preview = gr.Checkbox(label='Enable preview', value=True, info='Display a preview around each second generated but it costs 2 sec. for each second generated.')
1558
  use_teacache = gr.Checkbox(label='Use TeaCache', value=False, info='Faster speed and no break in brightness, but often makes hands and fingers slightly worse.')
1559
 
1560
+ n_prompt = gr.Textbox(label="Negative Prompt", value="Missing arm, long hand, sagging skin, unjustified body movement, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, abrupt change in camera trajectory, jumpcut, crossfader, crossfading", info='Requires using normal CFG (undistilled) instead of Distilled (set Distilled=1 and CFG > 1).')
1561
 
1562
  fps_number = gr.Slider(label="Frame per seconds", info="The model is trained for 30 fps so other fps may generate weird results", minimum=10, maximum=60, value=30, step=1)
1563
  end_stillness = gr.Slider(label="End stillness", minimum=0, maximum=100, value=0, step=1, info='0=Realistic end; >0=Matches exactly the end image (but the time seems to freeze)')
 
1570
  resolution = gr.Dropdown([
1571
  ["409,600 px (working)", 640],
1572
  ["451,584 px (working)", 672],
1573
+ ["495,616 px (working for extension)", 704],
1574
  ["589,824 px (not tested)", 768],
1575
  ["692,224 px (not tested)", 832],
1576
  ["746,496 px (not tested)", 864],
 
1598
  gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. Larger value causes slower speed.")
1599
 
1600
  mp4_crf = gr.Slider(label="MP4 Compression", minimum=0, maximum=100, value=16, step=1, info="Lower means better quality. 0 is uncompressed. Change to 16 if you get black outputs. ")
1601
+ batch = gr.Slider(label="Batch Size (number of videos)", minimum=1, maximum=1000, value=1, step=1, info='Generate multiple videos each with a different seed')
1602
  with gr.Row():
1603
  randomize_seed = gr.Checkbox(label='Randomize seed', value=True, info='If checked, the seed is always different')
1604
  seed = gr.Slider(label="Seed", minimum=0, maximum=np.iinfo(np.int32).max, step=1, randomize=True)
 
1608
 
1609
  with gr.Column():
1610
  warning = gr.HTML(elem_id="warning", value = "<center><big>Your computer must <u>not</u> enter into standby mode.</big><br/>On Chrome, you can force to keep a tab alive in <code>chrome://discards/</code></center>", visible = False)
1611
+ result_video = gr.Video(label="Generated Frames", autoplay = True, show_share_button = False, height = 512, loop = True)
1612
+ download_textbox = gr.HTML(label="Download list", visible = False)
1613
+
1614
@gr.render(inputs=download_textbox)
def show_split(download_textbox):
    """Render one download button per generated video file.

    ``download_textbox`` is the current value of the hidden download list:
    a single string in which every output path is prefixed with ";"
    (for example ";first.mp4;second.mp4"), as built by the GPU processing
    loops each time a 'file' event arrives.
    """
    # Nothing to render before the first file exists; the truthiness test
    # also covers a None value, on which len() would raise TypeError.
    if not download_textbox:
        return

    # The leading ";" produces an empty first segment; skipping empty
    # segments (rather than blindly dropping index 0) also keeps the first
    # path if the string ever arrives without that prefix.
    for one_path in download_textbox.split(";"):
        if one_path:
            gr.DownloadButton(label="Download", value=one_path)
1620
+
1621
+ preview_image = gr.Image(label="Next Latents", height = 200, visible = False)
1622
  progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
1623
  progress_bar = gr.HTML('', elem_classes='no-generating-animation')
1624
 
1625
+ ips = [input_image, end_image, image_position, end_stillness, final_prompt, generation_mode, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number]
1626
  ips_video = [input_video, end_image, end_stillness, final_prompt, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch]
1627
 
1628
  gr.Examples(
 
1635
  1, # end_stillness
1636
  "Overcrowed street in Japan, photorealistic, realistic, intricate details, 8k, insanely detailed",
1637
  "text", # generation_mode
1638
+ "Missing arm, long hand, sagging skin, unjustified body movement, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, abrupt change in camera trajectory, jumpcut, crossfader, crossfading", # n_prompt
1639
  True, # randomize_seed
1640
  42, # seed
1641
  True, # auto_allocation
1642
  180, # allocation_time
1643
+ 1, # batch
1644
  672, # resolution
1645
  1, # total_second_length
1646
  9, # latent_window_size
 
1658
  run_on_click = True,
1659
  fn = process,
1660
  inputs = ips,
1661
+ outputs = [result_video, download_textbox, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
1662
  cache_examples = False,
1663
  )
1664
 
 
1672
  1, # end_stillness
1673
  "A dolphin emerges from the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
1674
  "image", # generation_mode
1675
+ "Missing arm, long hand, sagging skin, unjustified body movement, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, abrupt change in camera trajectory, jumpcut, crossfader, crossfading", # n_prompt
1676
  True, # randomize_seed
1677
  42, # seed
1678
  True, # auto_allocation
1679
  180, # allocation_time
1680
+ 1, # batch
1681
  672, # resolution
1682
  1, # total_second_length
1683
  9, # latent_window_size
 
1698
  1, # end_stillness
1699
  "A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks, the man stops talking and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens",
1700
  "image", # generation_mode
1701
+ "Missing arm, long hand, sagging skin, unjustified body movement, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, abrupt change in camera trajectory, jumpcut, crossfader, crossfading", # n_prompt
1702
  True, # randomize_seed
1703
  42, # seed
1704
  True, # auto_allocation
1705
  180, # allocation_time
1706
+ 1, # batch
1707
  672, # resolution
1708
  2, # total_second_length
1709
  9, # latent_window_size
 
1724
  1, # end_stillness
1725
  "A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks, the woman stops talking and the woman listens A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens",
1726
  "image", # generation_mode
1727
+ "Missing arm, long hand, sagging skin, unjustified body movement, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, abrupt change in camera trajectory, jumpcut, crossfader, crossfading", # n_prompt
1728
  True, # randomize_seed
1729
  42, # seed
1730
  True, # auto_allocation
1731
  180, # allocation_time
1732
+ 1, # batch
1733
  672, # resolution
1734
  2, # total_second_length
1735
  9, # latent_window_size
 
1755
  42, # seed
1756
  True, # auto_allocation
1757
  180, # allocation_time
1758
+ 1, # batch
1759
  672, # resolution
1760
  1, # total_second_length
1761
  9, # latent_window_size
 
1776
  1, # end_stillness
1777
  "A building starting to explode, photorealistic, realisitc, 8k, insanely detailed",
1778
  "image", # generation_mode
1779
+ "Missing arm, long hand, sagging skin, unjustified body movement, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, abrupt change in camera trajectory, jumpcut, crossfader, crossfading", # n_prompt
1780
  True, # randomize_seed
1781
  42, # seed
1782
  True, # auto_allocation
1783
  180, # allocation_time
1784
+ 1, # batch
1785
  672, # resolution
1786
  1, # total_second_length
1787
  9, # latent_window_size
 
1799
  run_on_click = True,
1800
  fn = process,
1801
  inputs = ips,
1802
+ outputs = [result_video, download_textbox, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
1803
  cache_examples = False,
1804
  )
1805
 
 
1813
  0, # end_stillness
1814
  "A woman jumps out of the train and arrives on the ground, viewed from the outside, photorealistic, realistic, amateur photography, midday, insanely detailed, 8k", # prompt
1815
  "start_end", # generation_mode
1816
+ "Missing arm, long hand, sagging skin, unjustified body movement, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, abrupt change in camera trajectory, jumpcut, crossfader, crossfading", # n_prompt
1817
  True, # randomize_seed
1818
  42, # seed
1819
  True, # auto_allocation
1820
  180, # allocation_time
1821
+ 1, # batch
1822
  672, # resolution
1823
  1, # total_second_length
1824
  9, # latent_window_size
 
1836
  run_on_click = True,
1837
  fn = process,
1838
  inputs = ips,
1839
+ outputs = [result_video, download_textbox, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
1840
  cache_examples = False,
1841
  )
1842
 
 
1848
  None, # end_image
1849
  1, # end_stillness
1850
  "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
1851
+ "Missing arm, long hand, sagging skin, unjustified body movement, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, abrupt change in camera trajectory, jumpcut, crossfader, crossfading", # n_prompt
1852
  True, # randomize_seed
1853
  42, # seed
1854
  True, # auto_allocation
 
1874
  "./img_examples/Example1.png", # end_image
1875
  1, # end_stillness
1876
  "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
1877
+ "Missing arm, long hand, sagging skin, unjustified body movement, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, abrupt change in camera trajectory, jumpcut, crossfader, crossfading", # n_prompt
1878
  True, # randomize_seed
1879
  42, # seed
1880
  True, # auto_allocation
 
1899
  run_on_click = True,
1900
  fn = process_video,
1901
  inputs = ips_video,
1902
+ outputs = [result_video, download_textbox, preview_image, progress_desc, progress_bar, start_button_video, end_button, warning],
1903
  cache_examples = False,
1904
  )
1905
 
 
1937
  gr.update(visible = True), # start_button
1938
  gr.update(visible = False), # start_button_video
1939
  gr.update(visible = False), # no_resize
 
1940
  gr.update(visible = False), # num_clean_frames
1941
  gr.update(visible = False), # vae_batch
1942
  gr.update(visible = False), # prompt_hint
 
1953
  gr.update(visible = True), # start_button
1954
  gr.update(visible = False), # start_button_video
1955
  gr.update(visible = False), # no_resize
 
1956
  gr.update(visible = False), # num_clean_frames
1957
  gr.update(visible = False), # vae_batch
1958
  gr.update(visible = False), # prompt_hint
 
1969
  gr.update(visible = True), # start_button
1970
  gr.update(visible = False), # start_button_video
1971
  gr.update(visible = False), # no_resize
 
1972
  gr.update(visible = False), # num_clean_frames
1973
  gr.update(visible = False), # vae_batch
1974
  gr.update(visible = False), # prompt_hint
 
1985
  gr.update(visible = False), # start_button
1986
  gr.update(visible = True), # start_button_video
1987
  gr.update(visible = True), # no_resize
 
1988
  gr.update(visible = True), # num_clean_frames
1989
  gr.update(visible = True), # vae_batch
1990
  gr.update(visible = True), # prompt_hint
 
1995
  timeless_prompt.change(fn=handle_timeless_prompt_change, inputs=[timeless_prompt], outputs=[final_prompt])
1996
  start_button.click(fn = check_parameters, inputs = [
1997
  generation_mode, input_image, input_video
1998
+ ], outputs = [end_button, warning], queue = False, show_progress = False).success(fn=process, inputs=ips, outputs=[result_video, download_textbox, preview_image, progress_desc, progress_bar, start_button, end_button, warning], scroll_to_output = True)
1999
  start_button_video.click(fn = check_parameters, inputs = [
2000
  generation_mode, input_image, input_video
2001
+ ], outputs = [end_button, warning], queue = False, show_progress = False).success(fn=process_video, inputs=ips_video, outputs=[result_video, download_textbox, preview_image, progress_desc, progress_bar, start_button_video, end_button, warning], scroll_to_output = True)
2002
  end_button.click(fn=end_process)
2003
 
2004
  generation_mode.change(fn = save_preferences, inputs = [
 
2011
  generation_mode.change(
2012
  fn=handle_generation_mode_change,
2013
  inputs=[generation_mode],
2014
+ outputs=[text_to_video_hint, image_position, input_image, end_image, end_stillness, input_video, start_button, start_button_video, no_resize, num_clean_frames, vae_batch, prompt_hint, fps_number]
2015
  )
2016
 
2017
  # Update display when the page loads
 
2019
  fn=handle_generation_mode_change, inputs = [
2020
  generation_mode
2021
  ], outputs = [
2022
+ text_to_video_hint, image_position, input_image, end_image, end_stillness, input_video, start_button, start_button_video, no_resize, num_clean_frames, vae_batch, prompt_hint, fps_number
2023
  ]
2024
  )
2025