FramePack

Build error

App Files Community

Fabrice-TIERCELIN committed on Jun 9, 2025

Commit

27a6551

verified ·

1 Parent(s): d24caab

New features

Browse files

Files changed (1) hide show

app.py +68 -83

app.py CHANGED Viewed

@@ -42,6 +42,9 @@ from transformers import SiglipImageProcessor, SiglipVisionModel
 from diffusers_helper.clip_vision import hf_clip_vision_encode
 from diffusers_helper.bucket_tools import find_nearest_bucket
 from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, HunyuanVideoTransformer3DModel, HunyuanVideoPipeline
 high_vram = False
 free_mem_gb = 0
@@ -110,7 +113,7 @@ def check_parameters(generation_mode, input_image, input_video):
         raise gr.Error("Please provide an image to extend.")
     if generation_mode == "video" and input_video is None:
         raise gr.Error("Please provide a video to extend.")
-    return []
 @spaces.GPU()
 @torch.no_grad()
@@ -414,6 +417,10 @@ def worker(input_image, prompts, n_prompt, seed, total_second_length, latent_win
             stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
             return
         for section_index in range(total_latent_sections):
             if stream.input_queue.top() == 'end':
                 stream.output_queue.push(('end', None))
@@ -433,10 +440,6 @@ def worker(input_image, prompts, n_prompt, seed, total_second_length, latent_win
             else:
                 transformer.initialize_teacache(enable_teacache=False)
-            indices = torch.arange(0, sum([1, 16, 2, 1, latent_window_size])).unsqueeze(0)
-            clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
-            clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
             clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -sum([16, 2, 1]):, :, :].split([16, 2, 1], dim=2)
             clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2)
@@ -567,13 +570,28 @@ def process(input_image, prompt,
             yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
         if flag == 'end':
-            yield output_filename, gr.update(visible=False), gr.update(), '', gr.update(interactive=True), gr.update(interactive=False)
-            break
 # 20250506 pftq: Modified worker to accept video input and clean frame count
 @spaces.GPU()
 @torch.no_grad()
-def worker_video(input_video, prompt, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
     stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
@@ -591,15 +609,10 @@ def worker_video(input_video, prompt, n_prompt, seed, batch, resolution, total_s
             fake_diffusers_current_device(text_encoder, gpu)  # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode.
             load_model_as_complete(text_encoder_2, target_device=gpu)
-        llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
-        if cfg == 1:
-            llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
-        else:
-            llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
-        llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
-        llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
         # 20250506 pftq: Processing input video instead of image
         stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Video processing ...'))))
@@ -622,10 +635,6 @@ def worker_video(input_video, prompt, n_prompt, seed, batch, resolution, total_s
         image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
         # Dtype
-        llama_vec = llama_vec.to(transformer.dtype)
-        llama_vec_n = llama_vec_n.to(transformer.dtype)
-        clip_l_pooler = clip_l_pooler.to(transformer.dtype)
-        clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
         image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
         total_latent_sections = (total_second_length * fps) / (latent_window_size * 4)
@@ -679,6 +688,9 @@ def worker_video(input_video, prompt, n_prompt, seed, batch, resolution, total_s
                 print(f'section_index = {section_index}, total_latent_sections = {total_latent_sections}')
                 if not high_vram:
                     unload_complete_models()
                     move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
@@ -723,12 +735,12 @@ def worker_video(input_video, prompt, n_prompt, seed, batch, resolution, total_s
                             clean_latents_4x = splits[split_idx]
                             split_idx = 1
                         if clean_latents_4x.shape[2] < 2:  # 20250507 pftq: edge case for <=1 sec videos
-                            clean_latents_4x = torch.cat([clean_latents_4x, clean_latents_4x[:, :, -1:, :, :]], dim=2)[:, :, :2, :, :]
                         if num_2x_frames > 0 and split_idx < len(splits):
                             clean_latents_2x = splits[split_idx]
                             if clean_latents_2x.shape[2] < 2:  # 20250507 pftq: edge case for <=1 sec videos
-                                clean_latents_2x = torch.cat([clean_latents_2x, clean_latents_2x[:, :, -1:, :, :]], dim=2)[:, :, :2, :, :]
                             split_idx += 1
                         elif clean_latents_2x.shape[2] < 2:  # 20250507 pftq: edge case for <=1 sec videos
                             clean_latents_2x = clean_latents_4x
@@ -798,7 +810,7 @@ def worker_video(input_video, prompt, n_prompt, seed, batch, resolution, total_s
                 save_bcthw_as_mp4(history_pixels, output_filename, fps=fps, crf=mp4_crf)
                 print(f"Latest video saved: {output_filename}")
                 # 20250508 pftq: Save prompt to mp4 metadata comments
-                set_mp4_comments_imageio_ffmpeg(output_filename, f"Prompt: {prompt} | Negative Prompt: {n_prompt}");
                 print(f"Prompt saved to mp4 metadata comments: {output_filename}")
                 # 20250506 pftq: Clean up previous partial files
@@ -842,6 +854,8 @@ def process_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, re
     if randomize_seed:
         seed = random.randint(0, np.iinfo(np.int32).max)
     # 20250506 pftq: Updated assertion for video input
     assert input_video is not None, 'No input video!'
@@ -863,7 +877,7 @@ def process_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, re
     stream = AsyncStream()
     # 20250506 pftq: Pass num_clean_frames, vae_batch, etc
-    async_run(worker_video, input_video, prompt, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch)
     output_filename = None
@@ -880,8 +894,7 @@ def process_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, re
             yield output_filename, gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True) # 20250506 pftq: Keep refreshing the video in case it got hidden when the tab was in the background
         if flag == 'end':
-            yield output_filename, gr.update(visible=False), desc+' Video complete.', '', gr.update(interactive=True), gr.update(interactive=False)
-            break
 def end_process():
     stream.input_queue.push('end')
@@ -906,7 +919,10 @@ def refresh_prompt():
     sorted_dict_values = sorted(dict_values.items(), key=lambda x: x[0])
     array = []
     for sorted_dict_value in sorted_dict_values:
-        array.append(timeless_prompt_value[0] + ". " + sorted_dict_value[1])
     print(str(array))
     return ";".join(array)
@@ -914,7 +930,6 @@ title_html = """
     <h1><center>FramePack</center></h1>
     <big><center>Generate videos from text/image/video freely, without account, without watermark and download it</center></big>
     <br/>
-    <br/>
     <p>This space is ready to work on ZeroGPU and GPU and has been tested successfully on ZeroGPU. Please leave a <a href="https://huggingface.co/spaces/Fabrice-TIERCELIN/FramePack/discussions/new">message in discussion</a> if you encounter issues.</p>
     """
@@ -933,13 +948,12 @@ with block:
     gr.HTML(title_html)
     with gr.Row():
         with gr.Column():
-            generation_mode = gr.Radio([["Text-to-Video", "text"], ["Image-to-Video", "image"], ["Video-to-Video", "video"]], label="Generation mode", value = "image")
             text_to_video_hint = gr.HTML("I discourage to use the Text-to-Video feature. You should rather generate an image with Flux and use Image-to-Video. You will save time.", visible=False)
             input_image = gr.Image(sources='upload', type="numpy", label="Image", height=320)
             input_video = gr.Video(sources='upload', label="Input Video", height=320, visible=False)
-            timeless_prompt = gr.Textbox(label="Timeless prompt", info='Used on the whole duration of the generation', value='', placeholder="The creature starts to move, fast motion, focus motion, consistent arm, consistent position, fixed camera")
-            prompt_number = gr.Slider(label="Timed prompt number", minimum=0, maximum=1000, value=0, step=1, info='Not for video extension')
-            prompt_number.change(fn=handle_prompt_number_change, inputs=[], outputs=[])
             @gr.render(inputs=prompt_number)
             def show_split(prompt_number):
@@ -949,7 +963,6 @@ with block:
                     timed_prompt.change(fn=handle_timed_prompt_change, inputs=[timed_prompt_id, timed_prompt], outputs=[final_prompt])
             final_prompt = gr.Textbox(label="Final prompt", value='', info='Use ; to separate in time')
-            timeless_prompt.change(fn=handle_timeless_prompt_change, inputs=[timeless_prompt], outputs=[final_prompt])
             total_second_length = gr.Slider(label="Video Length to Generate (seconds)", minimum=1, maximum=120, value=2, step=0.1)
             with gr.Row():
@@ -960,7 +973,7 @@ with block:
             with gr.Accordion("Advanced settings", open=False):
                 with gr.Row():
                     use_teacache = gr.Checkbox(label='Use TeaCache', value=False, info='Faster speed, but often makes hands and fingers slightly worse.')
-                    no_resize = gr.Checkbox(label='Force Original Video Resolution (no Resizing) (only for video extension)', value=False, info='Might run out of VRAM (720p requires > 24GB VRAM).')
                 n_prompt = gr.Textbox(label="Negative Prompt", value="Missing arm, unrealistic position, blurred, blurry", info='Requires using normal CFG (undistilled) instead of Distilled (set Distilled=1 and CFG > 1).')
                 randomize_seed = gr.Checkbox(label='Randomize seed', value=True, info='If checked, the seed is always different')
@@ -968,18 +981,18 @@ with block:
                 latent_window_size = gr.Slider(label="Latent Window Size", minimum=1, maximum=33, value=9, step=1, info='Generate more frames at a time (larger chunks). Less degradation and better blending but higher VRAM cost. Should not change.')
                 steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=25, step=1, info='Increase for more quality, especially if using high non-distilled CFG. Changing this value is not recommended.')
-                batch = gr.Slider(label="Batch Size (Number of Videos)", minimum=1, maximum=1000, value=1, step=1, info='Generate multiple videos each with a different seed (only for video extension).')
-                resolution = gr.Number(label="Resolution (max width or height)", value=640, precision=0, info='Only for video extension')
                 # 20250506 pftq: Reduced default distilled guidance scale to improve adherence to input video
                 cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=1.0, step=0.01, info='Use this instead of Distilled for more detail/control + Negative Prompt (make sure Distilled set to 1). Doubles render time. Should not change.')
-                gs = gr.Slider(label="Distilled CFG Scale", minimum=1.0, maximum=32.0, value=10.0, step=0.01, info='Prompt adherence at the cost of less details from the input video, but to a lesser extent than Context Frames; 3=blurred motions& & unsharped, 10=focus motion; changing this value is not recommended')
-                rs = gr.Slider(label="CFG Re-Scale", minimum=0.0, maximum=1.0, value=0.0, step=0.01)  # Should not change
                 # 20250506 pftq: Renamed slider to Number of Context Frames and updated description
-                num_clean_frames = gr.Slider(label="Number of Context Frames", minimum=2, maximum=10, value=5, step=1, info="Retain more video details but increase memory use. Reduce to 2 if memory issues (only for video extension).")
                 default_vae = 32
                 if high_vram:
@@ -987,7 +1000,7 @@ with block:
                 elif free_mem_gb>=20:
                     default_vae = 64
-                vae_batch = gr.Slider(label="VAE Batch Size for Input Video", minimum=4, maximum=256, value=default_vae, step=4, info="Reduce if running out of memory. Increase for better quality frames during fast motion (only for video extension).")
                 gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. Larger value causes slower speed.")
@@ -1004,12 +1017,14 @@ with block:
     ips = [input_image, final_prompt, generation_mode, n_prompt, randomize_seed, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf]
     ips_video = [input_video, final_prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch]
     start_button.click(fn = check_parameters, inputs = [
         generation_mode, input_image, input_video
-    ], outputs = [], queue = False, show_progress = False).success(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button])
     start_button_video.click(fn = check_parameters, inputs = [
         generation_mode, input_image, input_video
-    ], outputs = [], queue = False, show_progress = False).success(fn=process_video, inputs=ips_video, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button_video, end_button])
     end_button.click(fn=end_process)
     gr.Examples(
@@ -1045,43 +1060,9 @@ with block:
                     10.0, # gs
                     0.0, # rs
                     6, # gpu_memory_preservation
-                    False, # use_teacache
                     16 # mp4_crf
-                ],
-                [
-                    "./img_examples/Example1.png", # input_image
-                    "We are sinking, photorealistic, realistic, intricate details, 8k, insanely detailed",
-                    "image", # generation_mode
-                    "Missing arm, unrealistic position, blurred, blurry", # n_prompt
-                    True, # randomize_seed
-                    42, # seed
-                    1, # total_second_length
-                    9, # latent_window_size
-                    25, # steps
-                    1.0, # cfg
-                    10.0, # gs
-                    0.0, # rs
-                    6, # gpu_memory_preservation
-                    False, # use_teacache
-                    16 # mp4_crf
-                ],
-                [
-                    "./img_examples/Example1.png", # input_image
-                    "A boat is passing, photorealistic, realistic, intricate details, 8k, insanely detailed",
-                    "image", # generation_mode
-                    "Missing arm, unrealistic position, blurred, blurry", # n_prompt
-                    True, # randomize_seed
-                    42, # seed
-                    1, # total_second_length
-                    9, # latent_window_size
-                    25, # steps
-                    1.0, # cfg
-                    10.0, # gs
-                    0.0, # rs
-                    6, # gpu_memory_preservation
-                    False, # use_teacache
-                    16 # mp4_crf
-                ],
             ],
         run_on_click = True,
         fn = process,
@@ -1121,19 +1102,23 @@ with block:
         cache_examples = torch.cuda.device_count() > 0,
     )
     def handle_generation_mode_change(generation_mode_data):
         if generation_mode_data == "text":
-            return [gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False)]
         elif generation_mode_data == "image":
-            return [gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False)]
         elif generation_mode_data == "video":
-            return [gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True)]
     generation_mode.change(
         fn=handle_generation_mode_change,
         inputs=[generation_mode],
-        outputs=[text_to_video_hint, input_image, input_video, start_button, start_button_video]
     )
-block.launch(mcp_server=False, ssr_mode=False)

 from diffusers_helper.clip_vision import hf_clip_vision_encode
 from diffusers_helper.bucket_tools import find_nearest_bucket
 from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, HunyuanVideoTransformer3DModel, HunyuanVideoPipeline
+import pillow_heif
+pillow_heif.register_heif_opener()
 high_vram = False
 free_mem_gb = 0
         raise gr.Error("Please provide an image to extend.")
     if generation_mode == "video" and input_video is None:
         raise gr.Error("Please provide a video to extend.")
+    return [gr.update(interactive=True)]
 @spaces.GPU()
 @torch.no_grad()
             stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
             return
+        indices = torch.arange(0, sum([1, 16, 2, 1, latent_window_size])).unsqueeze(0)
+        clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
+        clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
         for section_index in range(total_latent_sections):
             if stream.input_queue.top() == 'end':
                 stream.output_queue.push(('end', None))
             else:
                 transformer.initialize_teacache(enable_teacache=False)
             clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -sum([16, 2, 1]):, :, :].split([16, 2, 1], dim=2)
             clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2)
             yield gr.update(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True)
         if flag == 'end':
+            return output_filename, gr.update(visible=False), gr.update(), '', gr.update(interactive=True), gr.update(interactive=False)
 # 20250506 pftq: Modified worker to accept video input and clean frame count
 @spaces.GPU()
 @torch.no_grad()
+def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
+    def encode_prompt(prompt, n_prompt):
+        llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
+        if cfg == 1:
+            llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
+        else:
+            llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
+        llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
+        llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
+        llama_vec = llama_vec.to(transformer.dtype)
+        llama_vec_n = llama_vec_n.to(transformer.dtype)
+        clip_l_pooler = clip_l_pooler.to(transformer.dtype)
+        clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
+        return [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n]
     stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
             fake_diffusers_current_device(text_encoder, gpu)  # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode.
             load_model_as_complete(text_encoder_2, target_device=gpu)
+        prompt_parameters = []
+        for prompt_part in prompts:
+            prompt_parameters.append(encode_prompt(prompt_part, n_prompt))
         # 20250506 pftq: Processing input video instead of image
         stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Video processing ...'))))
         image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
         # Dtype
         image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
         total_latent_sections = (total_second_length * fps) / (latent_window_size * 4)
                 print(f'section_index = {section_index}, total_latent_sections = {total_latent_sections}')
+                if len(prompt_parameters) > 0:
+                    [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n] = prompt_parameters.pop(0)
                 if not high_vram:
                     unload_complete_models()
                     move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
                             clean_latents_4x = splits[split_idx]
                             split_idx = 1
                         if clean_latents_4x.shape[2] < 2:  # 20250507 pftq: edge case for <=1 sec videos
+                            clean_latents_4x = torch.cat([clean_latents_4x, clean_latents_4x], dim=2)
                         if num_2x_frames > 0 and split_idx < len(splits):
                             clean_latents_2x = splits[split_idx]
                             if clean_latents_2x.shape[2] < 2:  # 20250507 pftq: edge case for <=1 sec videos
+                                clean_latents_2x = torch.cat([clean_latents_2x, clean_latents_2x], dim=2)
                             split_idx += 1
                         elif clean_latents_2x.shape[2] < 2:  # 20250507 pftq: edge case for <=1 sec videos
                             clean_latents_2x = clean_latents_4x
                 save_bcthw_as_mp4(history_pixels, output_filename, fps=fps, crf=mp4_crf)
                 print(f"Latest video saved: {output_filename}")
                 # 20250508 pftq: Save prompt to mp4 metadata comments
+                set_mp4_comments_imageio_ffmpeg(output_filename, f"Prompt: {prompts} | Negative Prompt: {n_prompt}");
                 print(f"Prompt saved to mp4 metadata comments: {output_filename}")
                 # 20250506 pftq: Clean up previous partial files
     if randomize_seed:
         seed = random.randint(0, np.iinfo(np.int32).max)
+    prompts = prompt.split(";")
     # 20250506 pftq: Updated assertion for video input
     assert input_video is not None, 'No input video!'
     stream = AsyncStream()
     # 20250506 pftq: Pass num_clean_frames, vae_batch, etc
+    async_run(worker_video, input_video, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch)
     output_filename = None
             yield output_filename, gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True) # 20250506 pftq: Keep refreshing the video in case it got hidden when the tab was in the background
         if flag == 'end':
+            return output_filename, gr.update(visible=False), desc+' Video complete.', '', gr.update(interactive=True), gr.update(interactive=False)
 def end_process():
     stream.input_queue.push('end')
     sorted_dict_values = sorted(dict_values.items(), key=lambda x: x[0])
     array = []
     for sorted_dict_value in sorted_dict_values:
+        if timeless_prompt_value[0] is not None and len(timeless_prompt_value[0]) and sorted_dict_value[1] is not None and len(sorted_dict_value[1]):
+            array.append(timeless_prompt_value[0] + ". " + sorted_dict_value[1])
+        else:
+            array.append(timeless_prompt_value[0] + sorted_dict_value[1])
     print(str(array))
     return ";".join(array)
     <h1><center>FramePack</center></h1>
     <big><center>Generate videos from text/image/video freely, without account, without watermark and download it</center></big>
     <br/>
     <p>This space is ready to work on ZeroGPU and GPU and has been tested successfully on ZeroGPU. Please leave a <a href="https://huggingface.co/spaces/Fabrice-TIERCELIN/FramePack/discussions/new">message in discussion</a> if you encounter issues.</p>
     """
     gr.HTML(title_html)
     with gr.Row():
         with gr.Column():
+            generation_mode = gr.Radio([["Text-to-Video", "text"], ["Image-to-Video", "image"], ["Video Extension", "video"]], label="Generation mode", value = "image")
             text_to_video_hint = gr.HTML("I discourage to use the Text-to-Video feature. You should rather generate an image with Flux and use Image-to-Video. You will save time.", visible=False)
             input_image = gr.Image(sources='upload', type="numpy", label="Image", height=320)
             input_video = gr.Video(sources='upload', label="Input Video", height=320, visible=False)
+            timeless_prompt = gr.Textbox(label="Timeless prompt", info='Used on the whole duration of the generation', value='', placeholder="The creature starts to move, fast motion, fixed camera, focus motion, consistent arm, consistent position, mute colors, insanely detailed")
+            prompt_number = gr.Slider(label="Timed prompt number", minimum=0, maximum=1000, value=0, step=1, info='Prompts will automatically appear')
             @gr.render(inputs=prompt_number)
             def show_split(prompt_number):
                     timed_prompt.change(fn=handle_timed_prompt_change, inputs=[timed_prompt_id, timed_prompt], outputs=[final_prompt])
             final_prompt = gr.Textbox(label="Final prompt", value='', info='Use ; to separate in time')
             total_second_length = gr.Slider(label="Video Length to Generate (seconds)", minimum=1, maximum=120, value=2, step=0.1)
             with gr.Row():
             with gr.Accordion("Advanced settings", open=False):
                 with gr.Row():
                     use_teacache = gr.Checkbox(label='Use TeaCache', value=False, info='Faster speed, but often makes hands and fingers slightly worse.')
+                    no_resize = gr.Checkbox(label='Force Original Video Resolution (no Resizing)', value=False, info='Might run out of VRAM (720p requires > 24GB VRAM).', visible=False)
                 n_prompt = gr.Textbox(label="Negative Prompt", value="Missing arm, unrealistic position, blurred, blurry", info='Requires using normal CFG (undistilled) instead of Distilled (set Distilled=1 and CFG > 1).')
                 randomize_seed = gr.Checkbox(label='Randomize seed', value=True, info='If checked, the seed is always different')
                 latent_window_size = gr.Slider(label="Latent Window Size", minimum=1, maximum=33, value=9, step=1, info='Generate more frames at a time (larger chunks). Less degradation and better blending but higher VRAM cost. Should not change.')
                 steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=25, step=1, info='Increase for more quality, especially if using high non-distilled CFG. Changing this value is not recommended.')
+                batch = gr.Slider(label="Batch Size (Number of Videos)", minimum=1, maximum=1000, value=1, step=1, info='Generate multiple videos each with a different seed.', visible=False)
+                resolution = gr.Number(label="Resolution (max width or height)", value=640, precision=0, visible=False)
                 # 20250506 pftq: Reduced default distilled guidance scale to improve adherence to input video
                 cfg = gr.Slider(label="CFG Scale", minimum=1.0, maximum=32.0, value=1.0, step=0.01, info='Use this instead of Distilled for more detail/control + Negative Prompt (make sure Distilled set to 1). Doubles render time. Should not change.')
+                gs = gr.Slider(label="Distilled CFG Scale", minimum=1.0, maximum=32.0, value=10.0, step=0.01, info='Prompt adherence at the cost of less details from the input video, but to a lesser extent than Context Frames; 3=follow the prompt but blurred motions & unsharped, 10=focus motion; changing this value is not recommended')
+                rs = gr.Slider(label="CFG Re-Scale", minimum=0.0, maximum=1.0, value=0.0, step=0.01, info='Should not change')
                 # 20250506 pftq: Renamed slider to Number of Context Frames and updated description
+                num_clean_frames = gr.Slider(label="Number of Context Frames", minimum=2, maximum=10, value=5, step=1, info="Retain more video details but increase memory use. Reduce to 2 to avoid memory issues or to give more weight to the prompt.", visible=False)
                 default_vae = 32
                 if high_vram:
                 elif free_mem_gb>=20:
                     default_vae = 64
+                vae_batch = gr.Slider(label="VAE Batch Size for Input Video", minimum=4, maximum=256, value=default_vae, step=4, info="Reduce if running out of memory. Increase for better quality frames during fast motion.", visible=False)
                 gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. Larger value causes slower speed.")
     ips = [input_image, final_prompt, generation_mode, n_prompt, randomize_seed, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf]
     ips_video = [input_video, final_prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch]
+    prompt_number.change(fn=handle_prompt_number_change, inputs=[], outputs=[])
+    timeless_prompt.change(fn=handle_timeless_prompt_change, inputs=[timeless_prompt], outputs=[final_prompt])
     start_button.click(fn = check_parameters, inputs = [
         generation_mode, input_image, input_video
+    ], outputs = [end_button], queue = False, show_progress = False).success(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button])
     start_button_video.click(fn = check_parameters, inputs = [
         generation_mode, input_image, input_video
+    ], outputs = [end_button], queue = False, show_progress = False).success(fn=process_video, inputs=ips_video, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button_video, end_button])
     end_button.click(fn=end_process)
     gr.Examples(
                     10.0, # gs
                     0.0, # rs
                     6, # gpu_memory_preservation
+                    True, # use_teacache
                     16 # mp4_crf
+                ]
             ],
         run_on_click = True,
         fn = process,
         cache_examples = torch.cuda.device_count() > 0,
     )
+    gr.Markdown('''
+    # Guide
+    To make all your generated scenes consistent, you can then apply a face swap on the main character.
+    ''')
     def handle_generation_mode_change(generation_mode_data):
         if generation_mode_data == "text":
+            return [gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False)]
         elif generation_mode_data == "image":
+            return [gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False)]
         elif generation_mode_data == "video":
+            return [gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True)]
     generation_mode.change(
         fn=handle_generation_mode_change,
         inputs=[generation_mode],
+        outputs=[text_to_video_hint, input_image, input_video, start_button, start_button_video, no_resize, batch, resolution, num_clean_frames, vae_batch]
     )
+block.launch(mcp_server=True, ssr_mode=False)