Commit b138ec9 (verified) · committed by Fabrice-TIERCELIN
Parent(s): a6e5759

Finish merge

Files changed (1):
  app.py +25 -25
app.py CHANGED
@@ -43,11 +43,10 @@ from diffusers_helper.clip_vision import hf_clip_vision_encode
 from diffusers_helper.bucket_tools import find_nearest_bucket
 from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, HunyuanVideoTransformer3DModel, HunyuanVideoPipeline
 
-
 if torch.cuda.device_count() > 0:
     free_mem_gb = get_cuda_free_memory_gb(gpu)
     high_vram = free_mem_gb > 60
-
+
     print(f'Free VRAM {free_mem_gb} GB')
     print(f'High-VRAM Mode: {high_vram}')
 
@@ -56,37 +55,37 @@ if torch.cuda.device_count() > 0:
     tokenizer = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer')
     tokenizer_2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer_2')
     vae = AutoencoderKLHunyuanVideo.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='vae', torch_dtype=torch.float16).cpu()
-
+
     feature_extractor = SiglipImageProcessor.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='feature_extractor')
     image_encoder = SiglipVisionModel.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='image_encoder', torch_dtype=torch.float16).cpu()
-
+
     transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained('lllyasviel/FramePack_F1_I2V_HY_20250503', torch_dtype=torch.bfloat16).cpu()
-
+
     vae.eval()
     text_encoder.eval()
     text_encoder_2.eval()
     image_encoder.eval()
     transformer.eval()
-
+
     if not high_vram:
         vae.enable_slicing()
         vae.enable_tiling()
-
+
     transformer.high_quality_fp32_output_for_inference = True
     print('transformer.high_quality_fp32_output_for_inference = True')
-
+
     transformer.to(dtype=torch.bfloat16)
     vae.to(dtype=torch.float16)
     image_encoder.to(dtype=torch.float16)
     text_encoder.to(dtype=torch.float16)
     text_encoder_2.to(dtype=torch.float16)
-
+
     vae.requires_grad_(False)
     text_encoder.requires_grad_(False)
     text_encoder_2.requires_grad_(False)
     image_encoder.requires_grad_(False)
     transformer.requires_grad_(False)
-
+
     if not high_vram:
         # DynamicSwapInstaller is same as huggingface's enable_sequential_offload but 3x faster
         DynamicSwapInstaller.install_model(transformer, device=gpu)
@@ -337,7 +336,7 @@ def worker(input_image, prompts, n_prompt, seed, total_second_length, latent_win
         load_model_as_complete(text_encoder_2, target_device=gpu)
 
     prompt_parameters = []
-
+
     for prompt_part in prompts:
         prompt_parameters.append(encode_prompt(prompt_part, n_prompt))
 
@@ -512,18 +511,18 @@ def get_duration(input_image, prompt, t2v, n_prompt, randomize_seed, seed, total
 
 
 @spaces.GPU(duration=get_duration)
-def process(input_image, prompt,
-            t2v=False,
+def process(input_image, prompt,
+            t2v=False,
             n_prompt="",
            randomize_seed=True,
-            seed=31337,
-            total_second_length=5,
-            latent_window_size=9,
-            steps=25,
-            cfg=1.0,
-            gs=10.0,
-            rs=0.0,
-            gpu_memory_preservation=6,
+            seed=31337,
+            total_second_length=5,
+            latent_window_size=9,
+            steps=25,
+            cfg=1.0,
+            gs=10.0,
+            rs=0.0,
+            gpu_memory_preservation=6,
             use_teacache=True,
             mp4_crf=16
 ):
@@ -895,7 +894,6 @@ def process_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, re
             yield output_filename, gr.update(visible=False), desc+' Video complete.', '', gr.update(interactive=True), gr.update(interactive=False)
             break
 
-
 def end_process():
     stream.input_queue.push('end')
 
@@ -926,7 +924,8 @@ adapted from the official code repo [FramePack](https://github.com/lllyasviel/Fr
     t2v = gr.Checkbox(label="Do text-to-video (ignored for video extension)", value=False)
 
     with gr.Row():
-        start_button = gr.Button(value="Start Generation", variant="primary")
+        start_button = gr.Button(value="Generate from image", variant="primary")
+        start_button_video = gr.Button(value="Generate from video", variant="primary")
         end_button = gr.Button(value="End Generation", variant="stop", interactive=False)
 
     total_second_length = gr.Slider(label="Video Length to Generate (seconds)", minimum=1, maximum=120, value=2, step=0.1)
@@ -984,6 +983,7 @@ adapted from the official code repo [FramePack](https://github.com/lllyasviel/Fr
     ips = [input_image, prompt, t2v, n_prompt, randomize_seed, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf]
     ips_video = [input_video, prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch]
     start_button.click(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button])
+    start_button_video.click(fn=process_video, inputs=ips_video, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button_video, end_button])
     end_button.click(fn=end_process)
 
     with gr.Row(elem_id="image_examples", visible=False):
@@ -1093,7 +1093,7 @@ adapted from the official code repo [FramePack](https://github.com/lllyasviel/Fr
         run_on_click = True,
         fn = process_video,
         inputs = ips_video,
-        outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button],
+        outputs = [result_video, preview_image, progress_desc, progress_bar, start_button_video, end_button],
         cache_examples = True,
     )
 
@@ -1108,7 +1108,7 @@ adapted from the official code repo [FramePack](https://github.com/lllyasviel/Fr
         prompt_debug_value = prompt_debug_data
         total_second_length_debug_value = total_second_length_debug_data
         return []
-
+
     input_image_debug.upload(
         fn=handle_field_debug_change,
         inputs=[input_image_debug, input_video_debug, prompt_debug, total_second_length_debug],
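
For context on the first hunk: the app probes free VRAM once at startup and sets high_vram, which the rest of the file uses to choose between keeping models resident on the GPU and offloading them. A minimal sketch of that probe, assuming plain torch.cuda.mem_get_info in place of the repo's get_cuda_free_memory_gb helper:

import torch

def free_vram_gb(device: int = 0) -> float:
    # mem_get_info returns (free_bytes, total_bytes) for the CUDA device
    free_bytes, _total = torch.cuda.mem_get_info(device)
    return free_bytes / (1024 ** 3)

if torch.cuda.device_count() > 0:
    free_mem_gb = free_vram_gb()
    high_vram = free_mem_gb > 60  # same 60 GB threshold as app.py

    print(f'Free VRAM {free_mem_gb} GB')
    print(f'High-VRAM Mode: {high_vram}')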
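The second hunk's body is the standard inference-only setup repeated per model: eval mode, gradients disabled, weights cast to half precision, and everything parked on the CPU until a pipeline step needs it. A generic sketch of that recipe, using an arbitrary nn.Module as a stand-in for the repo's vae, encoders, and transformer:

import torch
import torch.nn as nn

def prepare_for_inference(model: nn.Module, dtype: torch.dtype) -> nn.Module:
    model.eval()                 # disable dropout / norm statistic updates
    model.requires_grad_(False)  # inference only; no gradients ever needed
    return model.to(dtype=dtype).cpu()  # keep on CPU until explicitly moved

# hypothetical stand-in model; app.py applies the same steps to each real model
encoder = prepare_for_inference(nn.Linear(16, 16), torch.float16)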
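The substantive change is in the UI hunks: the single "Start Generation" button is split into "Generate from image" and "Generate from video", each wired to its own handler (process and process_video) while sharing end_button, and the examples table is repointed at start_button_video so cached runs toggle the right button. A self-contained sketch of that two-button wiring, with trivial stand-in handlers in place of the app's generators:

import gradio as gr

# hypothetical stand-ins for the app's process / process_video handlers
def process(text):
    return f"image pipeline got: {text}"

def process_video(text):
    return f"video pipeline got: {text}"

with gr.Blocks() as demo:
    prompt = gr.Textbox(label="Prompt")
    result = gr.Textbox(label="Result")
    with gr.Row():
        start_button = gr.Button(value="Generate from image", variant="primary")
        start_button_video = gr.Button(value="Generate from video", variant="primary")
        end_button = gr.Button(value="End Generation", variant="stop", interactive=False)
    # each start button drives its own pipeline; both write to the same outputs
    start_button.click(fn=process, inputs=prompt, outputs=result)
    start_button_video.click(fn=process_video, inputs=prompt, outputs=result)

demo.launch()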