Finish merge
Browse files
app.py
CHANGED
|
@@ -43,11 +43,10 @@ from diffusers_helper.clip_vision import hf_clip_vision_encode
|
|
| 43 |
from diffusers_helper.bucket_tools import find_nearest_bucket
|
| 44 |
from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, HunyuanVideoTransformer3DModel, HunyuanVideoPipeline
|
| 45 |
|
| 46 |
-
|
| 47 |
if torch.cuda.device_count() > 0:
|
| 48 |
free_mem_gb = get_cuda_free_memory_gb(gpu)
|
| 49 |
high_vram = free_mem_gb > 60
|
| 50 |
-
|
| 51 |
print(f'Free VRAM {free_mem_gb} GB')
|
| 52 |
print(f'High-VRAM Mode: {high_vram}')
|
| 53 |
|
|
@@ -56,37 +55,37 @@ if torch.cuda.device_count() > 0:
|
|
| 56 |
tokenizer = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer')
|
| 57 |
tokenizer_2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer_2')
|
| 58 |
vae = AutoencoderKLHunyuanVideo.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='vae', torch_dtype=torch.float16).cpu()
|
| 59 |
-
|
| 60 |
feature_extractor = SiglipImageProcessor.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='feature_extractor')
|
| 61 |
image_encoder = SiglipVisionModel.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='image_encoder', torch_dtype=torch.float16).cpu()
|
| 62 |
-
|
| 63 |
transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained('lllyasviel/FramePack_F1_I2V_HY_20250503', torch_dtype=torch.bfloat16).cpu()
|
| 64 |
-
|
| 65 |
vae.eval()
|
| 66 |
text_encoder.eval()
|
| 67 |
text_encoder_2.eval()
|
| 68 |
image_encoder.eval()
|
| 69 |
transformer.eval()
|
| 70 |
-
|
| 71 |
if not high_vram:
|
| 72 |
vae.enable_slicing()
|
| 73 |
vae.enable_tiling()
|
| 74 |
-
|
| 75 |
transformer.high_quality_fp32_output_for_inference = True
|
| 76 |
print('transformer.high_quality_fp32_output_for_inference = True')
|
| 77 |
-
|
| 78 |
transformer.to(dtype=torch.bfloat16)
|
| 79 |
vae.to(dtype=torch.float16)
|
| 80 |
image_encoder.to(dtype=torch.float16)
|
| 81 |
text_encoder.to(dtype=torch.float16)
|
| 82 |
text_encoder_2.to(dtype=torch.float16)
|
| 83 |
-
|
| 84 |
vae.requires_grad_(False)
|
| 85 |
text_encoder.requires_grad_(False)
|
| 86 |
text_encoder_2.requires_grad_(False)
|
| 87 |
image_encoder.requires_grad_(False)
|
| 88 |
transformer.requires_grad_(False)
|
| 89 |
-
|
| 90 |
if not high_vram:
|
| 91 |
# DynamicSwapInstaller is same as huggingface's enable_sequential_offload but 3x faster
|
| 92 |
DynamicSwapInstaller.install_model(transformer, device=gpu)
|
|
@@ -337,7 +336,7 @@ def worker(input_image, prompts, n_prompt, seed, total_second_length, latent_win
|
|
| 337 |
load_model_as_complete(text_encoder_2, target_device=gpu)
|
| 338 |
|
| 339 |
prompt_parameters = []
|
| 340 |
-
|
| 341 |
for prompt_part in prompts:
|
| 342 |
prompt_parameters.append(encode_prompt(prompt_part, n_prompt))
|
| 343 |
|
|
@@ -512,18 +511,18 @@ def get_duration(input_image, prompt, t2v, n_prompt, randomize_seed, seed, total
|
|
| 512 |
|
| 513 |
|
| 514 |
@spaces.GPU(duration=get_duration)
|
| 515 |
-
def process(input_image, prompt,
|
| 516 |
-
t2v=False,
|
| 517 |
n_prompt="",
|
| 518 |
randomize_seed=True,
|
| 519 |
-
seed=31337,
|
| 520 |
-
total_second_length=5,
|
| 521 |
-
latent_window_size=9,
|
| 522 |
-
steps=25,
|
| 523 |
-
cfg=1.0,
|
| 524 |
-
gs=10.0,
|
| 525 |
-
rs=0.0,
|
| 526 |
-
gpu_memory_preservation=6,
|
| 527 |
use_teacache=True,
|
| 528 |
mp4_crf=16
|
| 529 |
):
|
|
@@ -895,7 +894,6 @@ def process_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, re
|
|
| 895 |
yield output_filename, gr.update(visible=False), desc+' Video complete.', '', gr.update(interactive=True), gr.update(interactive=False)
|
| 896 |
break
|
| 897 |
|
| 898 |
-
|
| 899 |
def end_process():
|
| 900 |
stream.input_queue.push('end')
|
| 901 |
|
|
@@ -926,7 +924,8 @@ adapted from the official code repo [FramePack](https://github.com/lllyasviel/Fr
|
|
| 926 |
t2v = gr.Checkbox(label="Do text-to-video (ignored for video extension)", value=False)
|
| 927 |
|
| 928 |
with gr.Row():
|
| 929 |
-
start_button = gr.Button(value="Start Generation", variant="primary")
|
|
|
|
| 930 |
end_button = gr.Button(value="End Generation", variant="stop", interactive=False)
|
| 931 |
|
| 932 |
total_second_length = gr.Slider(label="Video Length to Generate (seconds)", minimum=1, maximum=120, value=2, step=0.1)
|
|
@@ -984,6 +983,7 @@ adapted from the official code repo [FramePack](https://github.com/lllyasviel/Fr
|
|
| 984 |
ips = [input_image, prompt, t2v, n_prompt, randomize_seed, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf]
|
| 985 |
ips_video = [input_video, prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch]
|
| 986 |
start_button.click(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button])
|
|
|
|
| 987 |
end_button.click(fn=end_process)
|
| 988 |
|
| 989 |
with gr.Row(elem_id="image_examples", visible=False):
|
|
@@ -1093,7 +1093,7 @@ adapted from the official code repo [FramePack](https://github.com/lllyasviel/Fr
|
|
| 1093 |
run_on_click = True,
|
| 1094 |
fn = process_video,
|
| 1095 |
inputs = ips_video,
|
| 1096 |
-
outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button],
|
| 1097 |
cache_examples = True,
|
| 1098 |
)
|
| 1099 |
|
|
@@ -1108,7 +1108,7 @@ adapted from the official code repo [FramePack](https://github.com/lllyasviel/Fr
|
|
| 1108 |
prompt_debug_value = prompt_debug_data
|
| 1109 |
total_second_length_debug_value = total_second_length_debug_data
|
| 1110 |
return []
|
| 1111 |
-
|
| 1112 |
input_image_debug.upload(
|
| 1113 |
fn=handle_field_debug_change,
|
| 1114 |
inputs=[input_image_debug, input_video_debug, prompt_debug, total_second_length_debug],
|
|
|
|
| 43 |
from diffusers_helper.bucket_tools import find_nearest_bucket
|
| 44 |
from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, HunyuanVideoTransformer3DModel, HunyuanVideoPipeline
|
| 45 |
|
|
|
|
| 46 |
if torch.cuda.device_count() > 0:
|
| 47 |
free_mem_gb = get_cuda_free_memory_gb(gpu)
|
| 48 |
high_vram = free_mem_gb > 60
|
| 49 |
+
|
| 50 |
print(f'Free VRAM {free_mem_gb} GB')
|
| 51 |
print(f'High-VRAM Mode: {high_vram}')
|
| 52 |
|
|
|
|
| 55 |
tokenizer = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer')
|
| 56 |
tokenizer_2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer_2')
|
| 57 |
vae = AutoencoderKLHunyuanVideo.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='vae', torch_dtype=torch.float16).cpu()
|
| 58 |
+
|
| 59 |
feature_extractor = SiglipImageProcessor.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='feature_extractor')
|
| 60 |
image_encoder = SiglipVisionModel.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='image_encoder', torch_dtype=torch.float16).cpu()
|
| 61 |
+
|
| 62 |
transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained('lllyasviel/FramePack_F1_I2V_HY_20250503', torch_dtype=torch.bfloat16).cpu()
|
| 63 |
+
|
| 64 |
vae.eval()
|
| 65 |
text_encoder.eval()
|
| 66 |
text_encoder_2.eval()
|
| 67 |
image_encoder.eval()
|
| 68 |
transformer.eval()
|
| 69 |
+
|
| 70 |
if not high_vram:
|
| 71 |
vae.enable_slicing()
|
| 72 |
vae.enable_tiling()
|
| 73 |
+
|
| 74 |
transformer.high_quality_fp32_output_for_inference = True
|
| 75 |
print('transformer.high_quality_fp32_output_for_inference = True')
|
| 76 |
+
|
| 77 |
transformer.to(dtype=torch.bfloat16)
|
| 78 |
vae.to(dtype=torch.float16)
|
| 79 |
image_encoder.to(dtype=torch.float16)
|
| 80 |
text_encoder.to(dtype=torch.float16)
|
| 81 |
text_encoder_2.to(dtype=torch.float16)
|
| 82 |
+
|
| 83 |
vae.requires_grad_(False)
|
| 84 |
text_encoder.requires_grad_(False)
|
| 85 |
text_encoder_2.requires_grad_(False)
|
| 86 |
image_encoder.requires_grad_(False)
|
| 87 |
transformer.requires_grad_(False)
|
| 88 |
+
|
| 89 |
if not high_vram:
|
| 90 |
# DynamicSwapInstaller is same as huggingface's enable_sequential_offload but 3x faster
|
| 91 |
DynamicSwapInstaller.install_model(transformer, device=gpu)
|
|
|
|
| 336 |
load_model_as_complete(text_encoder_2, target_device=gpu)
|
| 337 |
|
| 338 |
prompt_parameters = []
|
| 339 |
+
|
| 340 |
for prompt_part in prompts:
|
| 341 |
prompt_parameters.append(encode_prompt(prompt_part, n_prompt))
|
| 342 |
|
|
|
|
| 511 |
|
| 512 |
|
| 513 |
@spaces.GPU(duration=get_duration)
|
| 514 |
+
def process(input_image, prompt,
|
| 515 |
+
t2v=False,
|
| 516 |
n_prompt="",
|
| 517 |
randomize_seed=True,
|
| 518 |
+
seed=31337,
|
| 519 |
+
total_second_length=5,
|
| 520 |
+
latent_window_size=9,
|
| 521 |
+
steps=25,
|
| 522 |
+
cfg=1.0,
|
| 523 |
+
gs=10.0,
|
| 524 |
+
rs=0.0,
|
| 525 |
+
gpu_memory_preservation=6,
|
| 526 |
use_teacache=True,
|
| 527 |
mp4_crf=16
|
| 528 |
):
|
|
|
|
| 894 |
yield output_filename, gr.update(visible=False), desc+' Video complete.', '', gr.update(interactive=True), gr.update(interactive=False)
|
| 895 |
break
|
| 896 |
|
|
|
|
| 897 |
def end_process():
|
| 898 |
stream.input_queue.push('end')
|
| 899 |
|
|
|
|
| 924 |
t2v = gr.Checkbox(label="Do text-to-video (ignored for video extension)", value=False)
|
| 925 |
|
| 926 |
with gr.Row():
|
| 927 |
+
start_button = gr.Button(value="Generate from image", variant="primary")
|
| 928 |
+
start_button_video = gr.Button(value="Generate from video", variant="primary")
|
| 929 |
end_button = gr.Button(value="End Generation", variant="stop", interactive=False)
|
| 930 |
|
| 931 |
total_second_length = gr.Slider(label="Video Length to Generate (seconds)", minimum=1, maximum=120, value=2, step=0.1)
|
|
|
|
| 983 |
ips = [input_image, prompt, t2v, n_prompt, randomize_seed, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf]
|
| 984 |
ips_video = [input_video, prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch]
|
| 985 |
start_button.click(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button])
|
| 986 |
+
start_button_video.click(fn=process_video, inputs=ips_video, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button_video, end_button])
|
| 987 |
end_button.click(fn=end_process)
|
| 988 |
|
| 989 |
with gr.Row(elem_id="image_examples", visible=False):
|
|
|
|
| 1093 |
run_on_click = True,
|
| 1094 |
fn = process_video,
|
| 1095 |
inputs = ips_video,
|
| 1096 |
+
outputs = [result_video, preview_image, progress_desc, progress_bar, start_button_video, end_button],
|
| 1097 |
cache_examples = True,
|
| 1098 |
)
|
| 1099 |
|
|
|
|
| 1108 |
prompt_debug_value = prompt_debug_data
|
| 1109 |
total_second_length_debug_value = total_second_length_debug_data
|
| 1110 |
return []
|
| 1111 |
+
|
| 1112 |
input_image_debug.upload(
|
| 1113 |
fn=handle_field_debug_change,
|
| 1114 |
inputs=[input_image_debug, input_video_debug, prompt_debug, total_second_length_debug],
|