Fabrice-TIERCELIN committed on
Commit
503d1fd
·
verified ·
1 Parent(s): 1899c3f

Make code closer

Browse files
Files changed (1) hide show
  1. app.py +475 -8
app.py CHANGED
@@ -109,6 +109,8 @@ stream = AsyncStream()
109
  outputs_folder = './outputs/'
110
  os.makedirs(outputs_folder, exist_ok=True)
111
 
 
 
112
  default_local_storage = {
113
  "generation-mode": "image",
114
  }
@@ -599,6 +601,285 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
599
  total_latent_sections = (total_second_length * fps) / (latent_window_size * 4)
600
  total_latent_sections = int(max(round(total_latent_sections), 1))
601
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
602
  if enable_preview:
603
  def callback(d):
604
  preview = d['denoised']
@@ -755,15 +1036,15 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
755
  offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
756
  load_model_as_complete(vae, target_device=gpu)
757
 
758
- real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :]
759
-
760
  if history_pixels is None:
 
761
  history_pixels = vae_decode(real_history_latents, vae).cpu()
762
  else:
763
- section_latent_frames = latent_window_size * 2
764
- overlapped_frames = min(latent_window_size * 4 - 3, history_pixels.shape[2])
765
 
766
- history_pixels = soft_append_bcthw(history_pixels, vae_decode(real_history_latents[:, :, -section_latent_frames:], vae).cpu(), overlapped_frames)
 
767
 
768
  if not high_vram:
769
  unload_complete_models()
@@ -805,6 +1086,10 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
805
  return
806
 
807
  def get_duration(input_image, image_position, prompt, generation_mode, n_prompt, randomize_seed, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf):
 
 
 
 
808
  return total_second_length * 60 * (0.9 if use_teacache else 1.5) * (1 + ((steps - 25) / 100))
809
 
810
  @spaces.GPU(duration=get_duration)
@@ -828,7 +1113,13 @@ def process(input_image,
828
  mp4_crf=16
829
  ):
830
  start = time.time()
831
- global stream
 
 
 
 
 
 
832
 
833
  if torch.cuda.device_count() == 0:
834
  gr.Warning('Set this space to GPU config to make it work.')
@@ -880,13 +1171,22 @@ def process(input_image,
880
  break
881
 
882
  def get_duration_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
 
 
 
883
  return total_second_length * 60 * (0.9 if use_teacache else 2.3) * (1 + ((steps - 25) / 100))
884
 
885
  # 20250506 pftq: Modified process to pass clean frame count, etc from video_encode
886
  @spaces.GPU(duration=get_duration_video)
887
  def process_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
888
  start = time.time()
889
- global stream, high_vram
 
 
 
 
 
 
890
 
891
  if torch.cuda.device_count() == 0:
892
  gr.Warning('Set this space to GPU config to make it work.')
@@ -932,7 +1232,6 @@ def process_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, re
932
 
933
  if flag == 'progress':
934
  preview, desc, html = data
935
- #yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
936
  yield output_filename, gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True) # 20250506 pftq: Keep refreshing the video in case it got hidden when the tab was in the background
937
 
938
  if flag == 'end':
@@ -1089,6 +1388,12 @@ with block:
1089
  randomize_seed = gr.Checkbox(label='Randomize seed', value=True, info='If checked, the seed is always different')
1090
  seed = gr.Slider(label="Seed", minimum=0, maximum=np.iinfo(np.int32).max, step=1, randomize=True)
1091
 
 
 
 
 
 
 
1092
  with gr.Column():
1093
  preview_image = gr.Image(label="Next Latents", height=200, visible=False)
1094
  result_video = gr.Video(label="Finished Frames", autoplay=True, show_share_button=False, height=512, loop=True)
@@ -1099,6 +1404,134 @@ with block:
1099
  ips = [input_image, image_position, final_prompt, generation_mode, n_prompt, randomize_seed, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf]
1100
  ips_video = [input_video, final_prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch]
1101
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1102
  gr.Examples(
1103
  label = "Examples from image",
1104
  examples = [
@@ -1271,6 +1704,40 @@ with block:
1271
  elif generation_mode_data == "video":
1272
  return [gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True)]
1273
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1274
  prompt_number.change(fn=handle_prompt_number_change, inputs=[], outputs=[])
1275
  timeless_prompt.change(fn=handle_timeless_prompt_change, inputs=[timeless_prompt], outputs=[final_prompt])
1276
  start_button.click(fn = check_parameters, inputs = [
 
109
  outputs_folder = './outputs/'
110
  os.makedirs(outputs_folder, exist_ok=True)
111
 
112
+ input_image_debug_value = input_video_debug_value = prompt_debug_value = total_second_length_debug_value = None
113
+
114
  default_local_storage = {
115
  "generation-mode": "image",
116
  }
 
601
  total_latent_sections = (total_second_length * fps) / (latent_window_size * 4)
602
  total_latent_sections = int(max(round(total_latent_sections), 1))
603
 
604
+ if enable_preview:
605
+ def callback(d):
606
+ preview = d['denoised']
607
+ preview = vae_decode_fake(preview)
608
+
609
+ preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
610
+ preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
611
+
612
+ if stream.input_queue.top() == 'end':
613
+ stream.output_queue.push(('end', None))
614
+ raise KeyboardInterrupt('User ends the task.')
615
+
616
+ current_step = d['i'] + 1
617
+ percentage = int(100.0 * current_step / steps)
618
+ hint = f'Sampling {current_step}/{steps}'
619
+ desc = f'Total frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / fps) :.2f} seconds (FPS-{fps}), Resolution: {height}px * {width}px, Seed: {seed}, Video {idx+1} of {batch}. The video is generating part {section_index+1} of {total_latent_sections}...'
620
+ stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
621
+ return
622
+ else:
623
+ def callback(d):
624
+ return
625
+
626
+ def compute_latent(history_latents, latent_window_size, num_clean_frames, start_latent):
627
+ # 20250506 pftq: Use user-specified number of context frames, matching original allocation for num_clean_frames=2
628
+ available_frames = history_latents.shape[2] # Number of latent frames
629
+ max_pixel_frames = min(latent_window_size * 4 - 3, available_frames * 4) # Cap at available pixel frames
630
+ adjusted_latent_frames = max(1, (max_pixel_frames + 3) // 4) # Convert back to latent frames
631
+ # Adjust num_clean_frames to match original behavior: num_clean_frames=2 means 1 frame for clean_latents_1x
632
+ effective_clean_frames = max(0, num_clean_frames - 1)
633
+ effective_clean_frames = min(effective_clean_frames, available_frames - 2) if available_frames > 2 else 0 # 20250507 pftq: changed 1 to 2 for edge case for <=1 sec videos
634
+ num_2x_frames = min(2, max(1, available_frames - effective_clean_frames - 1)) if available_frames > effective_clean_frames + 1 else 0 # 20250507 pftq: subtracted 1 for edge case for <=1 sec videos
635
+ num_4x_frames = min(16, max(1, available_frames - effective_clean_frames - num_2x_frames)) if available_frames > effective_clean_frames + num_2x_frames else 0 # 20250507 pftq: Edge case for <=1 sec
636
+
637
+ total_context_frames = num_4x_frames + num_2x_frames + effective_clean_frames
638
+ total_context_frames = min(total_context_frames, available_frames) # 20250507 pftq: Edge case for <=1 sec videos
639
+
640
+ indices = torch.arange(0, sum([1, num_4x_frames, num_2x_frames, effective_clean_frames, adjusted_latent_frames])).unsqueeze(0) # 20250507 pftq: latent_window_size to adjusted_latent_frames for edge case for <=1 sec videos
641
+ clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split(
642
+ [1, num_4x_frames, num_2x_frames, effective_clean_frames, adjusted_latent_frames], dim=1 # 20250507 pftq: latent_window_size to adjusted_latent_frames for edge case for <=1 sec videos
643
+ )
644
+ clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
645
+
646
+ # 20250506 pftq: Split history_latents dynamically based on available frames
647
+ fallback_frame_count = 2 # 20250507 pftq: Changed 0 to 2 Edge case for <=1 sec videos
648
+ context_frames = clean_latents_4x = clean_latents_2x = clean_latents_1x = history_latents[:, :, :fallback_frame_count, :, :]
649
+
650
+ if total_context_frames > 0:
651
+ context_frames = history_latents[:, :, -total_context_frames:, :, :]
652
+ split_sizes = [num_4x_frames, num_2x_frames, effective_clean_frames]
653
+ split_sizes = [s for s in split_sizes if s > 0] # Remove zero sizes
654
+ if split_sizes:
655
+ splits = context_frames.split(split_sizes, dim=2)
656
+ split_idx = 0
657
+
658
+ if num_4x_frames > 0:
659
+ clean_latents_4x = splits[split_idx]
660
+ split_idx = 1
661
+ if clean_latents_4x.shape[2] < 2: # 20250507 pftq: edge case for <=1 sec videos
662
+ print("Edge case for <=1 sec videos 4x")
663
+ clean_latents_4x = clean_latents_4x.expand(-1, -1, 2, -1, -1)
664
+
665
+ if num_2x_frames > 0 and split_idx < len(splits):
666
+ clean_latents_2x = splits[split_idx]
667
+ if clean_latents_2x.shape[2] < 2: # 20250507 pftq: edge case for <=1 sec videos
668
+ print("Edge case for <=1 sec videos 2x")
669
+ clean_latents_2x = clean_latents_2x.expand(-1, -1, 2, -1, -1)
670
+ split_idx += 1
671
+ elif clean_latents_2x.shape[2] < 2: # 20250507 pftq: edge case for <=1 sec videos
672
+ clean_latents_2x = clean_latents_4x
673
+
674
+ if effective_clean_frames > 0 and split_idx < len(splits):
675
+ clean_latents_1x = splits[split_idx]
676
+
677
+ clean_latents = torch.cat([start_latent, clean_latents_1x], dim=2)
678
+
679
+ # 20250507 pftq: Fix for <=1 sec videos.
680
+ max_frames = min(latent_window_size * 4 - 3, history_latents.shape[2] * 4)
681
+ return [max_frames, clean_latents, clean_latents_2x, clean_latents_4x, latent_indices, clean_latents, clean_latent_indices, clean_latent_2x_indices, clean_latent_4x_indices]
682
+
683
+ for idx in range(batch):
684
+ if batch > 1:
685
+ print(f"Beginning video {idx+1} of {batch} with seed {seed} ")
686
+
687
+ #job_id = generate_timestamp()
688
+ job_id = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")+f"_framepackf1-videoinput_{width}-{total_second_length}sec_seed-{seed}_steps-{steps}_distilled-{gs}_cfg-{cfg}" # 20250506 pftq: easier to read timestamp and filename
689
+
690
+ # Sampling
691
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))
692
+
693
+ rnd = torch.Generator("cpu").manual_seed(seed)
694
+
695
+ # 20250506 pftq: Initialize history_latents with video latents
696
+ ############################################### code from image
697
+ history_latents = torch.zeros(size=(1, 16, 16 + 2 + 1, height // 8, width // 8), dtype=torch.float32).cpu()
698
+ start_latent = start_latent.to(history_latents)
699
+ # 20250506 pftq: Initialize history_pixels to fix UnboundLocalError
700
+ history_pixels = None
701
+ previous_video = None
702
+
703
+ history_latents = torch.cat([history_latents, start_latent], dim=2)
704
+ total_generated_latent_frames = 1
705
+
706
+ for section_index in range(total_latent_sections):
707
+ if stream.input_queue.top() == 'end':
708
+ stream.output_queue.push(('end', None))
709
+ return
710
+
711
+ print(f'section_index = {section_index}, total_latent_sections = {total_latent_sections}')
712
+
713
+ if len(prompt_parameters) > 0:
714
+ [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n] = prompt_parameters.pop(0)
715
+
716
+ if not high_vram:
717
+ unload_complete_models()
718
+ move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
719
+
720
+ if use_teacache:
721
+ transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
722
+ else:
723
+ transformer.initialize_teacache(enable_teacache=False)
724
+
725
+ indices = torch.arange(0, sum([1, 16, 2, 1, latent_window_size])).unsqueeze(0)
726
+ clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
727
+ clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
728
+
729
+ clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -sum([16, 2, 1]):, :, :].split([16, 2, 1], dim=2)
730
+ clean_latents = torch.cat([start_latent, clean_latents_1x], dim=2)
731
+
732
+ generated_latents = sample_hunyuan(
733
+ transformer=transformer,
734
+ sampler='unipc',
735
+ width=width,
736
+ height=height,
737
+ frames=latent_window_size * 4 - 3,
738
+ real_guidance_scale=cfg,
739
+ distilled_guidance_scale=gs,
740
+ guidance_rescale=rs,
741
+ num_inference_steps=steps,
742
+ generator=rnd,
743
+ prompt_embeds=llama_vec,
744
+ prompt_embeds_mask=llama_attention_mask,
745
+ prompt_poolers=clip_l_pooler,
746
+ negative_prompt_embeds=llama_vec_n,
747
+ negative_prompt_embeds_mask=llama_attention_mask_n,
748
+ negative_prompt_poolers=clip_l_pooler_n,
749
+ device=gpu,
750
+ dtype=torch.bfloat16,
751
+ image_embeddings=image_encoder_last_hidden_state,
752
+ latent_indices=latent_indices,
753
+ clean_latents=clean_latents,
754
+ clean_latent_indices=clean_latent_indices,
755
+ clean_latents_2x=clean_latents_2x,
756
+ clean_latent_2x_indices=clean_latent_2x_indices,
757
+ clean_latents_4x=clean_latents_4x,
758
+ clean_latent_4x_indices=clean_latent_4x_indices,
759
+ callback=callback,
760
+ )
761
+
762
+ total_generated_latent_frames += int(generated_latents.shape[2])
763
+ history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)
764
+
765
+ if not high_vram:
766
+ offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
767
+ load_model_as_complete(vae, target_device=gpu)
768
+
769
+ if history_pixels is None:
770
+ real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :]
771
+ history_pixels = vae_decode(real_history_latents, vae).cpu()
772
+ else:
773
+ section_latent_frames = latent_window_size * 2
774
+ overlapped_frames = min(latent_window_size * 4 - 3, history_pixels.shape[2])
775
+
776
+ real_history_latents = history_latents[:, :, -min(total_generated_latent_frames, section_latent_frames):, :, :]
777
+ history_pixels = soft_append_bcthw(history_pixels, vae_decode(real_history_latents, vae).cpu(), overlapped_frames)
778
+
779
+ if not high_vram:
780
+ unload_complete_models()
781
+
782
+ if enable_preview or section_index == total_latent_sections - 1:
783
+ output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
784
+
785
+ # 20250506 pftq: Use input video FPS for output
786
+ save_bcthw_as_mp4(history_pixels, output_filename, fps=fps, crf=mp4_crf)
787
+ print(f"Latest video saved: {output_filename}")
788
+ # 20250508 pftq: Save prompt to mp4 metadata comments
789
+ set_mp4_comments_imageio_ffmpeg(output_filename, f"Prompt: {prompts} | Negative Prompt: {n_prompt}");
790
+ print(f"Prompt saved to mp4 metadata comments: {output_filename}")
791
+
792
+ # 20250506 pftq: Clean up previous partial files
793
+ if previous_video is not None and os.path.exists(previous_video):
794
+ try:
795
+ os.remove(previous_video)
796
+ print(f"Previous partial video deleted: {previous_video}")
797
+ except Exception as e:
798
+ print(f"Error deleting previous partial video {previous_video}: {e}")
799
+ previous_video = output_filename
800
+
801
+ print(f'Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}')
802
+
803
+ stream.output_queue.push(('file', output_filename))
804
+
805
+ seed = (seed + 1) % np.iinfo(np.int32).max
806
+
807
+ except:
808
+ traceback.print_exc()
809
+
810
+ if not high_vram:
811
+ unload_complete_models(
812
+ text_encoder, text_encoder_2, image_encoder, vae, transformer
813
+ )
814
+
815
+ stream.output_queue.push(('end', None))
816
+ return
817
+
818
+ # 20250506 pftq: Modified worker to accept video input and clean frame count
819
+ @spaces.GPU()
820
+ @torch.no_grad()
821
+ def worker_video_original(input_video, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
822
+ def encode_prompt(prompt, n_prompt):
823
+ llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
824
+
825
+ if cfg == 1:
826
+ llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
827
+ else:
828
+ llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
829
+
830
+ llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
831
+ llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
832
+
833
+ llama_vec = llama_vec.to(transformer.dtype)
834
+ llama_vec_n = llama_vec_n.to(transformer.dtype)
835
+ clip_l_pooler = clip_l_pooler.to(transformer.dtype)
836
+ clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
837
+ return [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n]
838
+
839
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
840
+
841
+ try:
842
+ # Clean GPU
843
+ if not high_vram:
844
+ unload_complete_models(
845
+ text_encoder, text_encoder_2, image_encoder, vae, transformer
846
+ )
847
+
848
+ # Text encoding
849
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...'))))
850
+
851
+ if not high_vram:
852
+ fake_diffusers_current_device(text_encoder, gpu) # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode.
853
+ load_model_as_complete(text_encoder_2, target_device=gpu)
854
+
855
+ prompt_parameters = []
856
+
857
+ for prompt_part in prompts:
858
+ prompt_parameters.append(encode_prompt(prompt_part, n_prompt))
859
+
860
+ # 20250506 pftq: Processing input video instead of image
861
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Video processing ...'))))
862
+
863
+ # 20250506 pftq: Encode video
864
+ start_latent, input_image_np, video_latents, fps, height, width = video_encode(input_video, resolution, no_resize, vae, vae_batch_size=vae_batch, device=gpu)[:6]
865
+ start_latent = start_latent.to(dtype=torch.float32).cpu()
866
+ video_latents = video_latents.cpu()
867
+
868
+ # CLIP Vision
869
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
870
+
871
+ if not high_vram:
872
+ load_model_as_complete(image_encoder, target_device=gpu)
873
+
874
+ image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
875
+ image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
876
+
877
+ # Dtype
878
+ image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
879
+
880
+ total_latent_sections = (total_second_length * fps) / (latent_window_size * 4)
881
+ total_latent_sections = int(max(round(total_latent_sections), 1))
882
+
883
  if enable_preview:
884
  def callback(d):
885
  preview = d['denoised']
 
1036
  offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
1037
  load_model_as_complete(vae, target_device=gpu)
1038
 
 
 
1039
  if history_pixels is None:
1040
+ real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :]
1041
  history_pixels = vae_decode(real_history_latents, vae).cpu()
1042
  else:
1043
+ section_latent_frames = latent_window_size * 2
1044
+ overlapped_frames = min(latent_window_size * 4 - 3, history_pixels.shape[2])
1045
 
1046
+ real_history_latents = history_latents[:, :, -min(total_generated_latent_frames, section_latent_frames):, :, :]
1047
+ history_pixels = soft_append_bcthw(history_pixels, vae_decode(real_history_latents, vae).cpu(), overlapped_frames)
1048
 
1049
  if not high_vram:
1050
  unload_complete_models()
 
1086
  return
1087
 
1088
  def get_duration(input_image, image_position, prompt, generation_mode, n_prompt, randomize_seed, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf):
1089
+ global total_second_length_debug_value
1090
+
1091
+ if total_second_length_debug_value is not None:
1092
+ return min(total_second_length_debug_value * 60 * 10, 600)
1093
  return total_second_length * 60 * (0.9 if use_teacache else 1.5) * (1 + ((steps - 25) / 100))
1094
 
1095
  @spaces.GPU(duration=get_duration)
 
1113
  mp4_crf=16
1114
  ):
1115
  start = time.time()
1116
+ global stream, input_image_debug_value, prompt_debug_value, total_second_length_debug_value
1117
+
1118
+ if input_image_debug_value is not None or prompt_debug_value is not None or total_second_length_debug_value is not None:
1119
+ input_image = input_image_debug_value
1120
+ prompt = prompt_debug_value
1121
+ total_second_length = total_second_length_debug_value
1122
+ input_image_debug_value = prompt_debug_value = total_second_length_debug_value = None
1123
 
1124
  if torch.cuda.device_count() == 0:
1125
  gr.Warning('Set this space to GPU config to make it work.')
 
1171
  break
1172
 
1173
  def get_duration_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
1174
+ global total_second_length_debug_value
1175
+ if total_second_length_debug_value is not None:
1176
+ return min(total_second_length_debug_value * 60 * 10, 600)
1177
  return total_second_length * 60 * (0.9 if use_teacache else 2.3) * (1 + ((steps - 25) / 100))
1178
 
1179
  # 20250506 pftq: Modified process to pass clean frame count, etc from video_encode
1180
  @spaces.GPU(duration=get_duration_video)
1181
  def process_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
1182
  start = time.time()
1183
+ global stream, high_vram, input_video_debug_value, prompt_debug_value, total_second_length_debug_value
1184
+
1185
+ if input_video_debug_value is not None or prompt_debug_value is not None or total_second_length_debug_value is not None:
1186
+ input_video = input_video_debug_value
1187
+ prompt = prompt_debug_value
1188
+ total_second_length = total_second_length_debug_value
1189
+ input_video_debug_value = prompt_debug_value = total_second_length_debug_value = None
1190
 
1191
  if torch.cuda.device_count() == 0:
1192
  gr.Warning('Set this space to GPU config to make it work.')
 
1232
 
1233
  if flag == 'progress':
1234
  preview, desc, html = data
 
1235
  yield output_filename, gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True) # 20250506 pftq: Keep refreshing the video in case it got hidden when the tab was in the background
1236
 
1237
  if flag == 'end':
 
1388
  randomize_seed = gr.Checkbox(label='Randomize seed', value=True, info='If checked, the seed is always different')
1389
  seed = gr.Slider(label="Seed", minimum=0, maximum=np.iinfo(np.int32).max, step=1, randomize=True)
1390
 
1391
+ with gr.Accordion("Debug", open=False):
1392
+ input_image_debug = gr.Image(type="numpy", label="Image Debug", height=320)
1393
+ input_video_debug = gr.Video(sources='upload', label="Input Video Debug", height=320)
1394
+ prompt_debug = gr.Textbox(label="Prompt Debug", value='')
1395
+ total_second_length_debug = gr.Slider(label="Additional Video Length to Generate (seconds) Debug", minimum=1, maximum=120, value=1, step=0.1)
1396
+
1397
  with gr.Column():
1398
  preview_image = gr.Image(label="Next Latents", height=200, visible=False)
1399
  result_video = gr.Video(label="Finished Frames", autoplay=True, show_share_button=False, height=512, loop=True)
 
1404
  ips = [input_image, image_position, final_prompt, generation_mode, n_prompt, randomize_seed, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf]
1405
  ips_video = [input_video, final_prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch]
1406
 
1407
+ with gr.Row(elem_id="image_examples", visible=False):
1408
+ gr.Examples(
1409
+ label = "Examples from image",
1410
+ examples = [
1411
+ [
1412
+ "./img_examples/Example2.webp", # input_image
1413
+ 0, # image_position
1414
+ "A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens",
1415
+ "image", # generation_mode
1416
+ "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1417
+ True, # randomize_seed
1418
+ 42, # seed
1419
+ 672, # resolution
1420
+ 1, # total_second_length
1421
+ 9, # latent_window_size
1422
+ 25, # steps
1423
+ 1.0, # cfg
1424
+ 10.0, # gs
1425
+ 0.0, # rs
1426
+ 6, # gpu_memory_preservation
1427
+ False, # enable_preview
1428
+ False, # use_teacache
1429
+ 16 # mp4_crf
1430
+ ],
1431
+ [
1432
+ "./img_examples/Example1.png", # input_image
1433
+ 0, # image_position
1434
+ "A dolphin emerges from the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
1435
+ "image", # generation_mode
1436
+ "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1437
+ True, # randomize_seed
1438
+ 42, # seed
1439
+ 672, # resolution
1440
+ 1, # total_second_length
1441
+ 9, # latent_window_size
1442
+ 25, # steps
1443
+ 1.0, # cfg
1444
+ 10.0, # gs
1445
+ 0.0, # rs
1446
+ 6, # gpu_memory_preservation
1447
+ False, # enable_preview
1448
+ True, # use_teacache
1449
+ 16 # mp4_crf
1450
+ ],
1451
+ [
1452
+ "./img_examples/Example4.webp", # input_image
1453
+ 100, # image_position
1454
+ "A building starting to explode, photorealistic, realisitc, 8k, insanely detailed",
1455
+ "image", # generation_mode
1456
+ "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1457
+ True, # randomize_seed
1458
+ 42, # seed
1459
+ 672, # resolution
1460
+ 1, # total_second_length
1461
+ 9, # latent_window_size
1462
+ 25, # steps
1463
+ 1.0, # cfg
1464
+ 10.0, # gs
1465
+ 0.0, # rs
1466
+ 6, # gpu_memory_preservation
1467
+ False, # enable_preview
1468
+ False, # use_teacache
1469
+ 16 # mp4_crf
1470
+ ],
1471
+ ],
1472
+ run_on_click = True,
1473
+ fn = process,
1474
+ inputs = ips,
1475
+ outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button],
1476
+ cache_examples = torch.cuda.device_count() > 0,
1477
+ )
1478
+
1479
+ with gr.Row(elem_id="video_examples", visible=False):
1480
+ gr.Examples(
1481
+ label = "Examples from video",
1482
+ examples = [
1483
+ [
1484
+ "./img_examples/Example1.mp4", # input_video
1485
+ "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
1486
+ "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1487
+ True, # randomize_seed
1488
+ 42, # seed
1489
+ 1, # batch
1490
+ 672, # resolution
1491
+ 1, # total_second_length
1492
+ 9, # latent_window_size
1493
+ 25, # steps
1494
+ 1.0, # cfg
1495
+ 10.0, # gs
1496
+ 0.0, # rs
1497
+ 6, # gpu_memory_preservation
1498
+ False, # enable_preview
1499
+ False, # use_teacache
1500
+ False, # no_resize
1501
+ 16, # mp4_crf
1502
+ 5, # num_clean_frames
1503
+ default_vae
1504
+ ],
1505
+ [
1506
+ "./img_examples/Example1.mp4", # input_video
1507
+ "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
1508
+ "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1509
+ True, # randomize_seed
1510
+ 42, # seed
1511
+ 1, # batch
1512
+ 672, # resolution
1513
+ 1, # total_second_length
1514
+ 9, # latent_window_size
1515
+ 25, # steps
1516
+ 1.0, # cfg
1517
+ 10.0, # gs
1518
+ 0.0, # rs
1519
+ 6, # gpu_memory_preservation
1520
+ False, # enable_preview
1521
+ True, # use_teacache
1522
+ False, # no_resize
1523
+ 16, # mp4_crf
1524
+ 5, # num_clean_frames
1525
+ default_vae
1526
+ ],
1527
+ ],
1528
+ run_on_click = True,
1529
+ fn = process_video,
1530
+ inputs = ips_video,
1531
+ outputs = [result_video, preview_image, progress_desc, progress_bar, start_button_video, end_button],
1532
+ cache_examples = torch.cuda.device_count() > 0,
1533
+ )
1534
+
1535
  gr.Examples(
1536
  label = "Examples from image",
1537
  examples = [
 
1704
  elif generation_mode_data == "video":
1705
  return [gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True)]
1706
 
1707
+
1708
+ def handle_field_debug_change(input_image_debug_data, input_video_debug_data, prompt_debug_data, total_second_length_debug_data):
1709
+ print("handle_field_debug_change")
1710
+ global input_image_debug_value, input_video_debug_value, prompt_debug_value, total_second_length_debug_value
1711
+ input_image_debug_value = input_image_debug_data
1712
+ input_video_debug_value = input_video_debug_data
1713
+ prompt_debug_value = prompt_debug_data
1714
+ total_second_length_debug_value = total_second_length_debug_data
1715
+ return []
1716
+
1717
+ input_image_debug.upload(
1718
+ fn=handle_field_debug_change,
1719
+ inputs=[input_image_debug, input_video_debug, prompt_debug, total_second_length_debug],
1720
+ outputs=[]
1721
+ )
1722
+
1723
+ input_video_debug.upload(
1724
+ fn=handle_field_debug_change,
1725
+ inputs=[input_image_debug, input_video_debug, prompt_debug, total_second_length_debug],
1726
+ outputs=[]
1727
+ )
1728
+
1729
+ prompt_debug.change(
1730
+ fn=handle_field_debug_change,
1731
+ inputs=[input_image_debug, input_video_debug, prompt_debug, total_second_length_debug],
1732
+ outputs=[]
1733
+ )
1734
+
1735
+ total_second_length_debug.change(
1736
+ fn=handle_field_debug_change,
1737
+ inputs=[input_image_debug, input_video_debug, prompt_debug, total_second_length_debug],
1738
+ outputs=[]
1739
+ )
1740
+
1741
  prompt_number.change(fn=handle_prompt_number_change, inputs=[], outputs=[])
1742
  timeless_prompt.change(fn=handle_timeless_prompt_change, inputs=[timeless_prompt], outputs=[final_prompt])
1743
  start_button.click(fn = check_parameters, inputs = [