Fabrice-TIERCELIN committed
Commit 7c17dfa · verified · 1 parent: a6ede4c
Files changed (1): app.py (+27 −27)
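The change is mechanical throughout: every direct reference to the `transformer` module becomes `transformer[0]`, which is consistent with the model now being held in a one-element container. The commit does not show the container's definition; a hedged sketch of the apparent pattern follows the diff.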
app.py CHANGED
--- a/app.py
+++ b/app.py
@@ -375,10 +375,10 @@ def worker(input_image, image_position, prompts, n_prompt, seed, resolution, tot
         llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
         llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)

-        llama_vec = llama_vec.to(transformer.dtype)
-        llama_vec_n = llama_vec_n.to(transformer.dtype)
-        clip_l_pooler = clip_l_pooler.to(transformer.dtype)
-        clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
+        llama_vec = llama_vec.to(transformer[0].dtype)
+        llama_vec_n = llama_vec_n.to(transformer[0].dtype)
+        clip_l_pooler = clip_l_pooler.to(transformer[0].dtype)
+        clip_l_pooler_n = clip_l_pooler_n.to(transformer[0].dtype)
         return [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n]

     total_latent_sections = (total_second_length * fps_number) / (latent_window_size * 4)
@@ -396,7 +396,7 @@ def worker(input_image, image_position, prompts, n_prompt, seed, resolution, tot
     # Clean GPU
     if not high_vram:
         unload_complete_models(
-            text_encoder, text_encoder_2, image_encoder, vae, transformer
+            text_encoder, text_encoder_2, image_encoder, vae, transformer[0]
         )

     # Text encoding
@@ -461,7 +461,7 @@ def worker(input_image, image_position, prompts, n_prompt, seed, resolution, tot

     # Dtype

-    image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
+    image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer[0].dtype)

     # Sampling

@@ -529,7 +529,7 @@ def worker(input_image, image_position, prompts, n_prompt, seed, resolution, tot
     history_pixels = soft_append_bcthw(vae_decode(real_history_latents, vae).cpu(), history_pixels, overlapped_frames)

     if not high_vram:
-        unload_complete_models(text_encoder, text_encoder_2, image_encoder, vae, transformer)
+        unload_complete_models(text_encoder, text_encoder_2, image_encoder, vae, transformer[0])

     if enable_preview or section_index == (0 if first_section_index == (total_latent_sections - 1) else (total_latent_sections - 1)):
         output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
@@ -557,12 +557,12 @@ def worker(input_image, image_position, prompts, n_prompt, seed, resolution, tot

     if not high_vram:
         unload_complete_models()
-        move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
+        move_model_to_device_with_memory_preservation(transformer[0], target_device=gpu, preserved_memory_gb=gpu_memory_preservation)

     if use_teacache:
-        transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
+        transformer[0].initialize_teacache(enable_teacache=True, num_steps=steps)
     else:
-        transformer.initialize_teacache(enable_teacache=False)
+        transformer[0].initialize_teacache(enable_teacache=False)

     if forward:
         clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -(16 + 2 + 1):, :, :].split([16, 2, 1], dim=2)
@@ -572,7 +572,7 @@ def worker(input_image, image_position, prompts, n_prompt, seed, resolution, tot
         clean_latents = torch.cat([clean_latents_1x, start_latent], dim=2)

     generated_latents = sample_hunyuan(
-        transformer=transformer,
+        transformer=transformer[0],
         sampler='unipc',
         width=width,
         height=height,
@@ -602,7 +602,7 @@ def worker(input_image, image_position, prompts, n_prompt, seed, resolution, tot
         callback=callback,
     )

-    [total_generated_latent_frames, history_latents, history_pixels] = post_process(forward, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, section_index, total_latent_sections, outputs_folder, mp4_crf, stream)
+    [total_generated_latent_frames, history_latents, history_pixels] = post_process(forward, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer[0], gpu, vae, history_pixels, latent_window_size, enable_preview, section_index, total_latent_sections, outputs_folder, mp4_crf, stream)

     if not forward:
         if section_index > 0:
@@ -626,7 +626,7 @@ def worker(input_image, image_position, prompts, n_prompt, seed, resolution, tot

     if not high_vram:
         unload_complete_models(
-            text_encoder, text_encoder_2, image_encoder, vae, transformer
+            text_encoder, text_encoder_2, image_encoder, vae, transformer[0]
         )

     stream.output_queue.push(('end', None))
@@ -681,10 +681,10 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
         llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
         llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)

-        llama_vec = llama_vec.to(transformer.dtype)
-        llama_vec_n = llama_vec_n.to(transformer.dtype)
-        clip_l_pooler = clip_l_pooler.to(transformer.dtype)
-        clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
+        llama_vec = llama_vec.to(transformer[0].dtype)
+        llama_vec_n = llama_vec_n.to(transformer[0].dtype)
+        clip_l_pooler = clip_l_pooler.to(transformer[0].dtype)
+        clip_l_pooler_n = clip_l_pooler_n.to(transformer[0].dtype)
         return [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n]

     stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
@@ -704,7 +704,7 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
     # Clean GPU
     if not high_vram:
         unload_complete_models(
-            text_encoder, text_encoder_2, image_encoder, vae, transformer
+            text_encoder, text_encoder_2, image_encoder, vae, transformer[0]
        )

     # Text encoding
@@ -740,7 +740,7 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
     image_encoder_last_hidden_state = image_encoder_output.last_hidden_state

     # Dtype
-    image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
+    image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer[0].dtype)

     if enable_preview:
         def callback(d):
@@ -852,17 +852,17 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_

     if not high_vram:
         unload_complete_models()
-        move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
+        move_model_to_device_with_memory_preservation(transformer[0], target_device=gpu, preserved_memory_gb=gpu_memory_preservation)

     if use_teacache:
-        transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
+        transformer[0].initialize_teacache(enable_teacache=True, num_steps=steps)
     else:
-        transformer.initialize_teacache(enable_teacache=False)
+        transformer[0].initialize_teacache(enable_teacache=False)

     [max_frames, clean_latents, clean_latents_2x, clean_latents_4x, latent_indices, clean_latents, clean_latent_indices, clean_latent_2x_indices, clean_latent_4x_indices] = compute_latent(history_latents, latent_window_size, num_clean_frames, start_latent)

     generated_latents = sample_hunyuan(
-        transformer=transformer,
+        transformer=transformer[0],
         sampler='unipc',
         width=width,
         height=height,
@@ -895,7 +895,7 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
     history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)

     if not high_vram:
-        offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
+        offload_model_from_device_for_memory_preservation(transformer[0], target_device=gpu, preserved_memory_gb=8)
         load_model_as_complete(vae, target_device=gpu)

     if history_pixels is None:
@@ -909,7 +909,7 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
     history_pixels = soft_append_bcthw(history_pixels, vae_decode(real_history_latents, vae).cpu(), overlapped_frames)

     if not high_vram:
-        unload_complete_models(text_encoder, text_encoder_2, image_encoder, vae, transformer)
+        unload_complete_models(text_encoder, text_encoder_2, image_encoder, vae, transformer[0])

     if enable_preview or section_index == total_latent_sections - 1:
         output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
@@ -941,7 +941,7 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_

     if not high_vram:
         unload_complete_models(
-            text_encoder, text_encoder_2, image_encoder, vae, transformer
+            text_encoder, text_encoder_2, image_encoder, vae, transformer[0]
         )

     stream.output_queue.push(('end', None))
@@ -1182,7 +1182,7 @@ def process_video(input_video, prompt, n_prompt, randomize_seed, seed, auto_allo
     high_vram = False
     vae.enable_slicing()
     vae.enable_tiling()
-    DynamicSwapInstaller.install_model(transformer, device=gpu)
+    DynamicSwapInstaller.install_model(transformer[0], device=gpu)
     DynamicSwapInstaller.install_model(text_encoder, device=gpu)

     # 20250508 pftq: automatically set distilled cfg to 1 if cfg is used
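Taken together, the hunks make one substitution: every call site now reads the model through `transformer[0]` rather than a bare `transformer` name. That is consistent with the transformer being kept in a one-element list so the underlying module can be rebound at runtime without touching the functions that use it. The container's definition is not part of this diff, so the sketch below is an illustration of that pattern under stated assumptions, not code from app.py; `encode` and `swap_transformer` are hypothetical names.

# Minimal sketch of the one-element-container pattern implied by the diff.
# Assumption: app.py now defines something like `transformer = [model]`.
import torch
import torch.nn as nn

transformer = [nn.Linear(8, 8).to(torch.float16)]  # stand-in for the video transformer

def encode(x: torch.Tensor) -> torch.Tensor:
    # Mirrors `llama_vec.to(transformer[0].dtype)` in the diff: indexing the
    # container means callers always see whichever module occupies slot 0.
    # (The real model exposes `.dtype` directly; a plain nn.Module does not,
    # hence `.weight.dtype` in this toy version.)
    return x.to(transformer[0].weight.dtype)

def swap_transformer(new_model: nn.Module) -> None:
    # Rebinding the slot updates every reader at once. A bare
    # `transformer = new_model` inside a function would only rebind a local
    # name (or require `global`), leaving other references stale.
    transformer[0] = new_model

x = torch.randn(2, 8)
print(encode(x).dtype)              # torch.float16
swap_transformer(nn.Linear(8, 8))
print(encode(x).dtype)              # torch.float32

Whether the indirection exists for checkpoint hot-swapping or simply to share one mutable reference across worker, worker_video, and process_video is not visible in this commit.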