Re-commit the right code
app.py CHANGED
@@ -11,7 +11,6 @@ import traceback
 import einops
 import safetensors.torch as sf
 import numpy as np
-import argparse
 import random
 import math
 # 20250506 pftq: Added for video input loading
@@ -397,6 +396,24 @@ def worker(input_image, prompts, n_prompt, seed, total_second_length, latent_win
     history_latents = torch.cat([history_latents, start_latent.to(history_latents)], dim=2)
     total_generated_latent_frames = 1

+    def callback(d):
+        preview = d['denoised']
+        preview = vae_decode_fake(preview)
+
+        preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
+        preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
+
+        if stream.input_queue.top() == 'end':
+            stream.output_queue.push(('end', None))
+            raise KeyboardInterrupt('User ends the task.')
+
+        current_step = d['i'] + 1
+        percentage = int(100.0 * current_step / steps)
+        hint = f'Sampling {current_step}/{steps}'
+        desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / 30) :.2f} seconds (FPS-30). The video is being extended now ...'
+        stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
+        return
+
     for section_index in range(total_latent_sections):
         if stream.input_queue.top() == 'end':
             stream.output_queue.push(('end', None))
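The progress text in this callback converts latent frames to pixel frames before reporting. A minimal standalone sketch of that arithmetic (sample values assumed, not from the app; in FramePack the first latent frame decodes to one pixel frame and each later one to four, hence 4*n - 3):

```python
# Sketch of the frame arithmetic used in the progress messages above.
def pixel_frames(latent_frames: int) -> int:
    # 4 pixel frames per latent frame, minus the 3-frame offset of the first latent
    return int(max(0, latent_frames * 4 - 3))

def video_seconds(latent_frames: int, fps: int = 30) -> float:
    return pixel_frames(latent_frames) / fps

print(pixel_frames(1))    # 1
print(pixel_frames(8))    # 29
print(video_seconds(8))   # ~0.97 seconds at 30 fps
```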
@@ -416,24 +433,6 @@ def worker(input_image, prompts, n_prompt, seed, total_second_length, latent_win
         else:
             transformer.initialize_teacache(enable_teacache=False)

-        def callback(d):
-            preview = d['denoised']
-            preview = vae_decode_fake(preview)
-
-            preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
-            preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
-
-            if stream.input_queue.top() == 'end':
-                stream.output_queue.push(('end', None))
-                raise KeyboardInterrupt('User ends the task.')
-
-            current_step = d['i'] + 1
-            percentage = int(100.0 * current_step / steps)
-            hint = f'Sampling {current_step}/{steps}'
-            desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / 30) :.2f} seconds (FPS-30). The video is being extended now ...'
-            stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
-            return
-
         indices = torch.arange(0, sum([1, 16, 2, 1, latent_window_size])).unsqueeze(0)
         clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
         clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
@@ -512,7 +511,7 @@ def worker(input_image, prompts, n_prompt, seed, total_second_length, latent_win
     return

 def get_duration(input_image, prompt, generation_mode, n_prompt, randomize_seed, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf):
-    return total_second_length * 60
+    return total_second_length * 60 * (0.7 if use_teacache else 1.3)


 @spaces.GPU(duration=get_duration)
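The duration estimator now accounts for TeaCache; `@spaces.GPU` takes this callable and reserves a ZeroGPU slot per call. A quick sanity check of the budgets the new formula yields (sample inputs, not from the app):

```python
# Sketch mirroring the new get_duration estimate (seconds of GPU time).
def estimate_duration(total_second_length: float, use_teacache: bool) -> float:
    return total_second_length * 60 * (0.7 if use_teacache else 1.3)

print(estimate_duration(2, True))    # 84.0  -> TeaCache run reserves a shorter slot
print(estimate_duration(2, False))   # 156.0 -> full-quality run reserves more time
```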
@@ -632,6 +631,24 @@ def worker_video(input_video, prompt, n_prompt, seed, batch, resolution, total_s
     total_latent_sections = (total_second_length * fps) / (latent_window_size * 4)
     total_latent_sections = int(max(round(total_latent_sections), 1))

+    def callback(d):
+        preview = d['denoised']
+        preview = vae_decode_fake(preview)
+
+        preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
+        preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
+
+        if stream.input_queue.top() == 'end':
+            stream.output_queue.push(('end', None))
+            raise KeyboardInterrupt('User ends the task.')
+
+        current_step = d['i'] + 1
+        percentage = int(100.0 * current_step / steps)
+        hint = f'Sampling {current_step}/{steps}'
+        desc = f'Total frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / fps) :.2f} seconds (FPS-{fps}), Seed: {seed}, Video {idx+1} of {batch}. The video is generating part {section_index+1} of {total_latent_sections}...'
+        stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
+        return
+
     for idx in range(batch):
         if batch > 1:
             print(f"Beginning video {idx+1} of {batch} with seed {seed} ")
@@ -671,24 +688,6 @@ def worker_video(input_video, prompt, n_prompt, seed, batch, resolution, total_s
             else:
                 transformer.initialize_teacache(enable_teacache=False)

-            def callback(d):
-                preview = d['denoised']
-                preview = vae_decode_fake(preview)
-
-                preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
-                preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
-
-                if stream.input_queue.top() == 'end':
-                    stream.output_queue.push(('end', None))
-                    raise KeyboardInterrupt('User ends the task.')
-
-                current_step = d['i'] + 1
-                percentage = int(100.0 * current_step / steps)
-                hint = f'Sampling {current_step}/{steps}'
-                desc = f'Total frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / fps) :.2f} seconds (FPS-{fps}), Seed: {seed}, Video {idx+1} of {batch}. The video is generating part {section_index+1} of {total_latent_sections}...'
-                stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
-                return
-
             # 20250506 pftq: Use user-specified number of context frames, matching original allocation for num_clean_frames=2
             available_frames = history_latents.shape[2] # Number of latent frames
             max_pixel_frames = min(latent_window_size * 4 - 3, available_frames * 4) # Cap at available pixel frames
@@ -710,26 +709,32 @@ def worker_video(input_video, prompt, n_prompt, seed, batch, resolution, total_s

             # 20250506 pftq: Split history_latents dynamically based on available frames
             fallback_frame_count = 2 # 20250507 pftq: Changed 0 to 2 Edge case for <=1 sec videos
-            context_frames =
+            context_frames = clean_latents_4x = clean_latents_2x = clean_latents_1x = history_latents[:, :, :fallback_frame_count, :, :]
+
             if total_context_frames > 0:
+                context_frames = history_latents[:, :, -total_context_frames:, :, :]
                 split_sizes = [num_4x_frames, num_2x_frames, effective_clean_frames]
                 split_sizes = [s for s in split_sizes if s > 0] # Remove zero sizes
                 if split_sizes:
                     splits = context_frames.split(split_sizes, dim=2)
                     split_idx = 0
-
+
+                    if num_4x_frames > 0:
+                        clean_latents_4x = splits[split_idx]
+                        split_idx = 1
                         if clean_latents_4x.shape[2] < 2: # 20250507 pftq: edge case for <=1 sec videos
                             clean_latents_4x = torch.cat([clean_latents_4x, clean_latents_4x[:, :, -1:, :, :]], dim=2)[:, :, :2, :, :]
-
-
-
-
-
-
-
-
-
-
+
+                    if num_2x_frames > 0 and split_idx < len(splits):
+                        clean_latents_2x = splits[split_idx]
+                        if clean_latents_2x.shape[2] < 2: # 20250507 pftq: edge case for <=1 sec videos
+                            clean_latents_2x = torch.cat([clean_latents_2x, clean_latents_2x[:, :, -1:, :, :]], dim=2)[:, :, :2, :, :]
+                        split_idx += 1
+                    elif clean_latents_2x.shape[2] < 2: # 20250507 pftq: edge case for <=1 sec videos
+                        clean_latents_2x = clean_latents_4x
+
+                    if effective_clean_frames > 0 and split_idx < len(splits):
+                        clean_latents_1x = splits[split_idx]

             clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2)

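The new fallback-then-split flow above can be exercised in isolation. A minimal sketch with assumed shapes (16 channels, 19 latent frames) showing how `Tensor.split` partitions the context along the time axis and how the under-2-frame edge case is padded:

```python
import torch

history_latents = torch.randn(1, 16, 19, 8, 8)  # (B, C, T_latent, H, W), assumed shape
num_4x_frames, num_2x_frames, effective_clean_frames = 16, 2, 1
total_context_frames = num_4x_frames + num_2x_frames + effective_clean_frames

# Take the most recent context frames, then split coarse -> fine along time (dim=2).
context_frames = history_latents[:, :, -total_context_frames:, :, :]
clean_latents_4x, clean_latents_2x, clean_latents_1x = context_frames.split(
    [num_4x_frames, num_2x_frames, effective_clean_frames], dim=2)

# Edge case for very short videos: repeat the last frame up to 2 frames.
if clean_latents_4x.shape[2] < 2:
    clean_latents_4x = torch.cat(
        [clean_latents_4x, clean_latents_4x[:, :, -1:, :, :]], dim=2)[:, :, :2, :, :]

print(clean_latents_4x.shape[2], clean_latents_2x.shape[2], clean_latents_1x.shape[2])  # 16 2 1
```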
@@ -781,11 +786,6 @@ def worker_video(input_video, prompt, n_prompt, seed, batch, resolution, total_s
                 section_latent_frames = latent_window_size * 2
                 overlapped_frames = min(latent_window_size * 4 - 3, history_pixels.shape[2])

-                #if section_index == 0:
-                    #extra_latents = 1 # Add up to 2 extra latent frames for smoother overlap to initial video
-                    #extra_pixel_frames = extra_latents * 4 # Approx. 4 pixel frames per latent
-                    #overlapped_frames = min(overlapped_frames + extra_pixel_frames, history_pixels.shape[2], section_latent_frames * 4)
-
                 current_pixels = vae_decode(real_history_latents[:, :, -section_latent_frames:], vae).cpu()
                 history_pixels = soft_append_bcthw(history_pixels, current_pixels, overlapped_frames)
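`soft_append_bcthw` comes from FramePack's helpers and blends the overlapping frames rather than hard-cutting between sections. A sketch of what such an append can look like, assuming a linear crossfade over the overlap window (the real helper may differ in details):

```python
import torch

def soft_append_bcthw_sketch(history: torch.Tensor, current: torch.Tensor, overlap: int) -> torch.Tensor:
    # Append two (B, C, T, H, W) clips, crossfading `overlap` frames linearly.
    if overlap <= 0:
        return torch.cat([history, current], dim=2)
    w = torch.linspace(1, 0, overlap).view(1, 1, overlap, 1, 1)  # history fades out
    blended = w * history[:, :, -overlap:] + (1 - w) * current[:, :, :overlap]
    return torch.cat([history[:, :, :-overlap], blended, current[:, :, overlap:]], dim=2)

a = torch.randn(1, 3, 10, 4, 4)
b = torch.randn(1, 3, 10, 4, 4)
print(soft_append_bcthw_sketch(a, b, 3).shape)  # torch.Size([1, 3, 17, 4, 4])
```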
@@ -828,12 +828,12 @@ def worker_video(input_video, prompt, n_prompt, seed, batch, resolution, total_s
     return

 def get_duration_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
-    return total_second_length * 60 * 2
+    return total_second_length * 60 * (0.7 if use_teacache else 2)

 # 20250506 pftq: Modified process to pass clean frame count, etc from video_encode
 @spaces.GPU(duration=get_duration_video)
 def process_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
-    global stream
+    global stream, high_vram

     if torch.cuda.device_count() == 0:
         gr.Warning('Set this space to GPU config to make it work.')
@@ -886,19 +886,27 @@ def process_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, re
 def end_process():
     stream.input_queue.push('end')

+timeless_prompt_value = [""]
 timed_prompts = {}

 def handle_prompt_number_change():
-    timed_prompts
+    timed_prompts.clear()
     return []

+def handle_timeless_prompt_change(timeless_prompt):
+    timeless_prompt_value[0] = timeless_prompt
+    return refresh_prompt()
+
 def handle_timed_prompt_change(timed_prompt_id, timed_prompt):
     timed_prompts[timed_prompt_id] = timed_prompt
+    return refresh_prompt()
+
+def refresh_prompt():
     dict_values = {k: v for k, v in timed_prompts.items()}
     sorted_dict_values = sorted(dict_values.items(), key=lambda x: x[0])
     array = []
     for sorted_dict_value in sorted_dict_values:
-        array.append(sorted_dict_value[1])
+        array.append(timeless_prompt_value[0] + ". " + sorted_dict_value[1])
     print(str(array))
     return ";".join(array)
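The refactored prompt plumbing prepends the timeless prompt to every timed prompt and joins the segments with ";". A minimal sketch of the resulting final prompt (hypothetical values):

```python
timeless_prompt_value = ["A creature on the beach"]
timed_prompts = {"timed_prompt_0": "it starts to move", "timed_prompt_1": "it runs away"}

array = [timeless_prompt_value[0] + ". " + v
         for _, v in sorted(timed_prompts.items(), key=lambda x: x[0])]
print(";".join(array))
# A creature on the beach. it starts to move;A creature on the beach. it runs away
```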
@@ -929,19 +937,19 @@ with block:
                 text_to_video_hint = gr.HTML("I discourage to use the Text-to-Video feature. You should rather generate an image with Flux and use Image-to-Video. You will save time.", visible=False)
                 input_image = gr.Image(sources='upload', type="numpy", label="Image", height=320)
                 input_video = gr.Video(sources='upload', label="Input Video", height=320, visible=False)
-
+                timeless_prompt = gr.Textbox(label="Timeless prompt", info='Used on the whole duration of the generation', value='', placeholder="The creature starts to move, fast motion, focus motion, consistent arm, consistent position, fixed camera")
                 prompt_number = gr.Slider(label="Timed prompt number", minimum=0, maximum=1000, value=0, step=1, info='Not for video extension')
                 prompt_number.change(fn=handle_prompt_number_change, inputs=[], outputs=[])

                 @gr.render(inputs=prompt_number)
                 def show_split(prompt_number):
-                    timed_prompts = {}
-
                     for digit in range(prompt_number):
                         timed_prompt_id = gr.Textbox(value="timed_prompt_" + str(digit), visible=False)
                         timed_prompt = gr.Textbox(label="Timed prompt #" + str(digit + 1), elem_id="timed_prompt_" + str(digit), value="")
-                        timed_prompt.change(fn=handle_timed_prompt_change, inputs=[timed_prompt_id, timed_prompt], outputs=[
+                        timed_prompt.change(fn=handle_timed_prompt_change, inputs=[timed_prompt_id, timed_prompt], outputs=[final_prompt])

+                final_prompt = gr.Textbox(label="Final prompt", value='', info='Use ; to separate in time')
+                timeless_prompt.change(fn=handle_timeless_prompt_change, inputs=[timeless_prompt], outputs=[final_prompt])
                 total_second_length = gr.Slider(label="Video Length to Generate (seconds)", minimum=1, maximum=120, value=2, step=0.1)

                 with gr.Row():
@@ -993,8 +1001,8 @@ with block:
             progress_bar = gr.HTML('', elem_classes='no-generating-animation')

         # 20250506 pftq: Updated inputs to include num_clean_frames
-        ips = [input_image,
-        ips_video = [input_video,
+        ips = [input_image, final_prompt, generation_mode, n_prompt, randomize_seed, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf]
+        ips_video = [input_video, final_prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch]

         start_button.click(fn = check_parameters, inputs = [
             generation_mode, input_image, input_video
@@ -1039,7 +1047,41 @@ with block:
                     6, # gpu_memory_preservation
                     False, # use_teacache
                     16 # mp4_crf
-                ]
+                ],
+                [
+                    "./img_examples/Example1.png", # input_image
+                    "We are sinking, photorealistic, realistic, intricate details, 8k, insanely detailed",
+                    "image", # generation_mode
+                    "Missing arm, unrealistic position, blurred, blurry", # n_prompt
+                    True, # randomize_seed
+                    42, # seed
+                    1, # total_second_length
+                    9, # latent_window_size
+                    25, # steps
+                    1.0, # cfg
+                    10.0, # gs
+                    0.0, # rs
+                    6, # gpu_memory_preservation
+                    False, # use_teacache
+                    16 # mp4_crf
+                ],
+                [
+                    "./img_examples/Example1.png", # input_image
+                    "A boat is passing, photorealistic, realistic, intricate details, 8k, insanely detailed",
+                    "image", # generation_mode
+                    "Missing arm, unrealistic position, blurred, blurry", # n_prompt
+                    True, # randomize_seed
+                    42, # seed
+                    1, # total_second_length
+                    9, # latent_window_size
+                    25, # steps
+                    1.0, # cfg
+                    10.0, # gs
+                    0.0, # rs
+                    6, # gpu_memory_preservation
+                    False, # use_teacache
+                    16 # mp4_crf
+                ],
             ],
             run_on_click = True,
             fn = process,
@@ -1088,7 +1130,6 @@ with block:
         elif generation_mode_data == "video":
             return [gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True)]

-
     generation_mode.change(
         fn=handle_generation_mode_change,
         inputs=[generation_mode],