FramePack

Build error

App Files Community

Fabrice-TIERCELIN committed on Jul 1, 2025

Commit

c994303

verified ·

1 Parent(s): e6fa5f4

Better allocation estimation

Browse files

Files changed (1) hide show

app.py +24 -15

app.py CHANGED Viewed

@@ -390,9 +390,13 @@ def worker(input_image, image_position, prompts, n_prompt, seed, resolution, tot
             stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
             if not high_vram:
                 load_model_as_complete(image_encoder, target_device=gpu)
             image_encoder_last_hidden_state = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder).last_hidden_state
             return [start_latent, image_encoder_last_hidden_state]
@@ -468,7 +472,7 @@ def worker(input_image, image_position, prompts, n_prompt, seed, resolution, tot
                     history_pixels = soft_append_bcthw(vae_decode(real_history_latents, vae).cpu(), history_pixels, overlapped_frames)
             if not high_vram:
-                unload_complete_models()
             if enable_preview or section_index == (0 if first_section_index == (total_latent_sections - 1) else (total_latent_sections - 1)):
                 output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
@@ -636,6 +640,11 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
             load_model_as_complete(image_encoder, target_device=gpu)
         image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
         image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
         # Dtype
@@ -808,7 +817,7 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
                     history_pixels = soft_append_bcthw(history_pixels, vae_decode(real_history_latents, vae).cpu(), overlapped_frames)
                 if not high_vram:
-                    unload_complete_models()
                 if enable_preview or section_index == total_latent_sections - 1:
                     output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
@@ -909,7 +918,7 @@ def process(input_image,
             fps_number=30
            ):
     if auto_allocation:
-        allocation_time = min(total_second_length * 60 * (0.9 if use_teacache else 3.0) * (1 + ((steps - 25) / 25)), 600)
     if torch.cuda.device_count() == 0:
         gr.Warning('Set this space to GPU config to make it work.')
@@ -994,7 +1003,7 @@ def process_video_on_gpu(input_video, prompts, n_prompt, seed, batch, resolution
 def process_video(input_video, prompt, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
     global high_vram
     if auto_allocation:
-        allocation_time = min(total_second_length * 60 * (1.5 if use_teacache else 3.0) * (1 + ((steps - 25) / 25)), 600)
     if torch.cuda.device_count() == 0:
         gr.Warning('Set this space to GPU config to make it work.')
@@ -1066,7 +1075,7 @@ title_html = """
 js = """
 function createGradioAnimation() {
-    window.addEventListener("beforeunload", function (e) {
         if (document.getElementById('end-button') && !document.getElementById('end-button').disabled) {
             var confirmationMessage = 'A process is still running. '
                                     + 'If you leave before saving, your changes will be lost.';
@@ -1095,7 +1104,7 @@ with block:
     with gr.Row():
         with gr.Column():
             generation_mode = gr.Radio([["Text-to-Video", "text"], ["Image-to-Video", "image"], ["Video Extension", "video"]], elem_id="generation-mode", label="Generation mode", value = "image")
-            text_to_video_hint = gr.HTML("Text-to-Video badly works. I discourage to use the Text-to-Video feature. You should rather generate an image with Flux and use Image-to-Video. You will save time.")
             input_image = gr.Image(sources='upload', type="numpy", label="Image", height=320)
             image_position = gr.Slider(label="Image position", minimum=0, maximum=100, value=0, step=1, info='0=Video start; 100=Video end (lower quality)')
             input_video = gr.Video(sources='upload', label="Input Video", height=320)
@@ -1122,7 +1131,7 @@ with block:
                 enable_preview = gr.Checkbox(label='Enable preview', value=True, info='Display a preview around each second generated but it costs 2 sec. for each second generated.')
                 use_teacache = gr.Checkbox(label='Use TeaCache', value=False, info='Faster speed and no break in brightness, but often makes hands and fingers slightly worse.')
-                n_prompt = gr.Textbox(label="Negative Prompt", value="Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", info='Requires using normal CFG (undistilled) instead of Distilled (set Distilled=1 and CFG > 1).')
                 fps_number = gr.Slider(label="Frame per seconds", info="The model is trained for 30 fps so other fps may generate weird results", minimum=10, maximum=60, value=30, step=1)
@@ -1171,7 +1180,7 @@ with block:
                     allocation_time = gr.Slider(label="GPU allocation time (in seconds)", info='lower=May abort run, higher=Quota penalty for next runs; only useful for ZeroGPU; for instance set to 88 when you have the message "You have exceeded your GPU quota (180s requested vs. 89s left)."', value=180, minimum=60, maximum=320, step=1)
         with gr.Column():
-            warning = gr.HTML(value = "<center><big>Your computer must <u>not</u> enter into standby mode.</big><br/>On Chrome, you can force to keep a tab alive in <code>chrome://discards/</code></center>", visible = False)
             result_video = gr.Video(label="Generated Frames", autoplay=True, show_share_button=False, height=512, loop=True)
             preview_image = gr.Image(label="Next Latents", height=200, visible=False)
             progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
@@ -1189,7 +1198,7 @@ with block:
                     0, # image_position
                     "Overcrowed street in Japan, photorealistic, realistic, intricate details, 8k, insanely detailed",
                     "text", # generation_mode
-                    "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
                     True, # randomize_seed
                     42, # seed
                     True, # auto_allocation
@@ -1223,7 +1232,7 @@ with block:
                     0, # image_position
                     "A dolphin emerges from the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
                     "image", # generation_mode
-                    "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
                     True, # randomize_seed
                     42, # seed
                     True, # auto_allocation
@@ -1246,7 +1255,7 @@ with block:
                     0, # image_position
                     "A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks, the man stops talking and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens",
                     "image", # generation_mode
-                    "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
                     True, # randomize_seed
                     42, # seed
                     True, # auto_allocation
@@ -1269,7 +1278,7 @@ with block:
                     0, # image_position
                     "A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks, the woman stops talking and the woman listens A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens",
                     "image", # generation_mode
-                    "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
                     True, # randomize_seed
                     42, # seed
                     True, # auto_allocation
@@ -1292,7 +1301,7 @@ with block:
                     0, # image_position
                     "A boy is walking to the right, full view, full-length view, cartoon",
                     "image", # generation_mode
-                    "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
                     True, # randomize_seed
                     42, # seed
                     True, # auto_allocation
@@ -1315,7 +1324,7 @@ with block:
                     100, # image_position
                     "A building starting to explode, photorealistic, realisitc, 8k, insanely detailed",
                     "image", # generation_mode
-                    "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
                     True, # randomize_seed
                     42, # seed
                     True, # auto_allocation
@@ -1347,7 +1356,7 @@ with block:
                 [
                     "./img_examples/Example1.mp4", # input_video
                     "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
-                    "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
                     True, # randomize_seed
                     42, # seed
                     True, # auto_allocation

             stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
             if not high_vram:
+                unload_complete_models(vae)
                 load_model_as_complete(image_encoder, target_device=gpu)
             image_encoder_last_hidden_state = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder).last_hidden_state
+            if not high_vram:
+                unload_complete_models(image_encoder)
             return [start_latent, image_encoder_last_hidden_state]
                     history_pixels = soft_append_bcthw(vae_decode(real_history_latents, vae).cpu(), history_pixels, overlapped_frames)
             if not high_vram:
+                unload_complete_models(text_encoder, text_encoder_2, image_encoder, vae, transformer)
             if enable_preview or section_index == (0 if first_section_index == (total_latent_sections - 1) else (total_latent_sections - 1)):
                 output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
             load_model_as_complete(image_encoder, target_device=gpu)
         image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
+        # Clean GPU
+        if not high_vram:
+            unload_complete_models(image_encoder)
         image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
         # Dtype
                     history_pixels = soft_append_bcthw(history_pixels, vae_decode(real_history_latents, vae).cpu(), overlapped_frames)
                 if not high_vram:
+                    unload_complete_models(text_encoder, text_encoder_2, image_encoder, vae, transformer)
                 if enable_preview or section_index == total_latent_sections - 1:
                     output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
             fps_number=30
            ):
     if auto_allocation:
+        allocation_time = min(total_second_length * 60 * (1.5 if use_teacache else 3.0) * (1 + ((steps - 25) / 25)), 600)
     if torch.cuda.device_count() == 0:
         gr.Warning('Set this space to GPU config to make it work.')
 def process_video(input_video, prompt, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
     global high_vram
     if auto_allocation:
+        allocation_time = min(total_second_length * 60 * (2.5 if use_teacache else 3.5) * (1 + ((steps - 25) / 25)), 600)
     if torch.cuda.device_count() == 0:
         gr.Warning('Set this space to GPU config to make it work.')
 js = """
 function createGradioAnimation() {
+    window.addEventListener("beforeunload", function(e) {
         if (document.getElementById('end-button') && !document.getElementById('end-button').disabled) {
             var confirmationMessage = 'A process is still running. '
                                     + 'If you leave before saving, your changes will be lost.';
     with gr.Row():
         with gr.Column():
             generation_mode = gr.Radio([["Text-to-Video", "text"], ["Image-to-Video", "image"], ["Video Extension", "video"]], elem_id="generation-mode", label="Generation mode", value = "image")
+            text_to_video_hint = gr.HTML("Text-to-Video badly works with a flash effect at the start. I discourage to use the Text-to-Video feature. You should rather generate an image with Flux and use Image-to-Video. You will save time.")
             input_image = gr.Image(sources='upload', type="numpy", label="Image", height=320)
             image_position = gr.Slider(label="Image position", minimum=0, maximum=100, value=0, step=1, info='0=Video start; 100=Video end (lower quality)')
             input_video = gr.Video(sources='upload', label="Input Video", height=320)
                 enable_preview = gr.Checkbox(label='Enable preview', value=True, info='Display a preview around each second generated but it costs 2 sec. for each second generated.')
                 use_teacache = gr.Checkbox(label='Use TeaCache', value=False, info='Faster speed and no break in brightness, but often makes hands and fingers slightly worse.')
+                n_prompt = gr.Textbox(label="Negative Prompt", value="Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", info='Requires using normal CFG (undistilled) instead of Distilled (set Distilled=1 and CFG > 1).')
                 fps_number = gr.Slider(label="Frame per seconds", info="The model is trained for 30 fps so other fps may generate weird results", minimum=10, maximum=60, value=30, step=1)
                     allocation_time = gr.Slider(label="GPU allocation time (in seconds)", info='lower=May abort run, higher=Quota penalty for next runs; only useful for ZeroGPU; for instance set to 88 when you have the message "You have exceeded your GPU quota (180s requested vs. 89s left)."', value=180, minimum=60, maximum=320, step=1)
         with gr.Column():
+            warning = gr.HTML(elem_id="warning", value = "<center><big>Your computer must <u>not</u> enter into standby mode.</big><br/>On Chrome, you can force to keep a tab alive in <code>chrome://discards/</code></center>", visible = False)
             result_video = gr.Video(label="Generated Frames", autoplay=True, show_share_button=False, height=512, loop=True)
             preview_image = gr.Image(label="Next Latents", height=200, visible=False)
             progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
                     0, # image_position
                     "Overcrowed street in Japan, photorealistic, realistic, intricate details, 8k, insanely detailed",
                     "text", # generation_mode
+                    "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
                     True, # randomize_seed
                     42, # seed
                     True, # auto_allocation
                     0, # image_position
                     "A dolphin emerges from the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
                     "image", # generation_mode
+                    "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
                     True, # randomize_seed
                     42, # seed
                     True, # auto_allocation
                     0, # image_position
                     "A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks, the man stops talking and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens",
                     "image", # generation_mode
+                    "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
                     True, # randomize_seed
                     42, # seed
                     True, # auto_allocation
                     0, # image_position
                     "A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks, the woman stops talking and the woman listens A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens",
                     "image", # generation_mode
+                    "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
                     True, # randomize_seed
                     42, # seed
                     True, # auto_allocation
                     0, # image_position
                     "A boy is walking to the right, full view, full-length view, cartoon",
                     "image", # generation_mode
+                    "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
                     True, # randomize_seed
                     42, # seed
                     True, # auto_allocation
                     100, # image_position
                     "A building starting to explode, photorealistic, realisitc, 8k, insanely detailed",
                     "image", # generation_mode
+                    "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
                     True, # randomize_seed
                     42, # seed
                     True, # auto_allocation
                 [
                     "./img_examples/Example1.mp4", # input_video
                     "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
+                    "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
                     True, # randomize_seed
                     42, # seed
                     True, # auto_allocation