FramePack

Build error

App Files Files Community

Fabrice-TIERCELIN committed on Aug 25, 2025

Commit

8261bda

verified ·

1 Parent(s): bb5e9a5

Batch mode

Browse files

Files changed (1) hide show

app.py +384 -350

app.py CHANGED Viewed

@@ -41,7 +41,7 @@ from PIL import Image
 from diffusers import AutoencoderKLHunyuanVideo
 from transformers import LlamaModel, CLIPTextModel, LlamaTokenizerFast, CLIPTokenizer
 from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode, vae_decode_fake
-from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, state_dict_weighted_merge, state_dict_offset_merge, generate_timestamp
 from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
 from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
 if torch.cuda.device_count() > 0:
@@ -368,7 +368,7 @@ def image_encode(image_np, target_width, target_height, vae, image_encoder, feat
         raise
 @torch.no_grad()
-def worker(input_image, end_image, image_position, end_stillness, prompts, n_prompt, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
     def encode_prompt(prompt, n_prompt):
         llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
@@ -393,8 +393,6 @@ def worker(input_image, end_image, image_position, end_stillness, prompts, n_pro
     section_index = first_section_index
     forward = (image_position == 0)
-    job_id = generate_timestamp()
     stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
     try:
@@ -470,172 +468,179 @@ def worker(input_image, end_image, image_position, end_stillness, prompts, n_pro
         image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
-        # Sampling
-        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))
-        rnd = torch.Generator("cpu").manual_seed(seed)
-        history_latents = torch.zeros(size=(1, 16, 16 + 2 + 1, height // 8, width // 8), dtype=torch.float32, device=cpu)
-        start_latent = start_latent.to(history_latents)
-        history_pixels = None
-        history_latents = torch.cat([history_latents, start_latent] if forward else [start_latent, history_latents], dim=2)
-        total_generated_latent_frames = 1
-        if enable_preview:
-            def callback(d):
-                preview = d['denoised']
-                preview = vae_decode_fake(preview)
-                preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
-                preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
-                if stream.input_queue.top() == 'end':
-                    stream.output_queue.push(('end', None))
-                    raise KeyboardInterrupt('User ends the task.')
-                current_step = d['i'] + 1
-                percentage = int(100.0 * current_step / steps)
-                hint = f'Sampling {current_step}/{steps}'
-                desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / fps_number) :.2f} seconds (FPS-30), Resolution: {height}px * {width}px. The video is being extended now ...'
-                stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
-                return
-        else:
-            def callback(d):
-                return
-        indices = torch.arange(0, 1 + 16 + 2 + 1 + latent_window_size).unsqueeze(0)
-        if forward:
-            clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
-            clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
-        else:
-            latent_indices, clean_latent_1x_indices, clean_latent_2x_indices, clean_latent_4x_indices, clean_latent_indices_start = indices.split([latent_window_size, 1, 2, 16, 1], dim=1)
-            clean_latent_indices = torch.cat([clean_latent_1x_indices, clean_latent_indices_start], dim=1)
-        def post_process(forward, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, section_index, total_latent_sections, outputs_folder, mp4_crf, stream):
-            total_generated_latent_frames += int(generated_latents.shape[2])
-            history_latents = torch.cat([history_latents, generated_latents.to(history_latents)] if forward else [generated_latents.to(history_latents), history_latents], dim=2)
-            if not high_vram:
-                offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
-                load_model_as_complete(vae, target_device=gpu)
-            if history_pixels is None:
-                real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :] if forward else history_latents[:, :, :total_generated_latent_frames, :, :]
-                history_pixels = vae_decode(real_history_latents, vae).cpu()
-            else:
-                section_latent_frames = latent_window_size * 2
-                overlapped_frames = latent_window_size * 4 - 3
-                if forward:
-                    real_history_latents = history_latents[:, :, -min(section_latent_frames, total_generated_latent_frames):, :, :]
-                    history_pixels = soft_append_bcthw(history_pixels, vae_decode(real_history_latents, vae).cpu(), overlapped_frames)
-                else:
-                    real_history_latents = history_latents[:, :, :min(section_latent_frames, total_generated_latent_frames), :, :]
-                    history_pixels = soft_append_bcthw(vae_decode(real_history_latents, vae).cpu(), history_pixels, overlapped_frames)
-            if not high_vram:
-                unload_complete_models(text_encoder, text_encoder_2, image_encoder, vae, transformer)
-            if enable_preview or section_index == (0 if first_section_index == (total_latent_sections - 1) else (total_latent_sections - 1)):
-                output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
-                save_bcthw_as_mp4(history_pixels, output_filename, fps=fps_number, crf=mp4_crf)
-                print(f'Decoded. Current latent shape pixel shape {history_pixels.shape}')
-                stream.output_queue.push(('file', output_filename))
-            return [total_generated_latent_frames, history_latents, history_pixels]
-        while section_index < total_latent_sections:
-            if stream.input_queue.top() == 'end':
-                stream.output_queue.push(('end', None))
-                return
-            print(f'section_index = {section_index}, total_latent_sections = {total_latent_sections}')
-            prompt_index = min(section_index, len(prompt_parameters) - 1)
-            [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n] = prompt_parameters[prompt_index]
-            if prompt_index < len(prompt_parameters) - 1 or (prompt_index == total_latent_sections - 1):
-                del prompt_parameters[prompt_index]
-            if not high_vram:
-                unload_complete_models()
-                move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
-            if use_teacache:
-                transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
             else:
-                transformer.initialize_teacache(enable_teacache=False)
             if forward:
-                clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -(16 + 2 + 1):, :, :].split([16, 2, 1], dim=2)
-                clean_latents = torch.cat([start_latent, clean_latents_1x], dim=2)
             else:
-                clean_latents_1x, clean_latents_2x, clean_latents_4x = history_latents[:, :, :(1 + 2 + 16), :, :].split([1, 2, 16], dim=2)
-                clean_latents = torch.cat([clean_latents_1x, start_latent], dim=2)
-            generated_latents = sample_hunyuan(
-                transformer=transformer,
-                sampler='unipc',
-                width=width,
-                height=height,
-                frames=latent_window_size * 4 - 3,
-                real_guidance_scale=cfg,
-                distilled_guidance_scale=gs,
-                guidance_rescale=rs,
-                # shift=3.0,
-                num_inference_steps=steps,
-                generator=rnd,
-                prompt_embeds=llama_vec,
-                prompt_embeds_mask=llama_attention_mask,
-                prompt_poolers=clip_l_pooler,
-                negative_prompt_embeds=llama_vec_n,
-                negative_prompt_embeds_mask=llama_attention_mask_n,
-                negative_prompt_poolers=clip_l_pooler_n,
-                device=gpu,
-                dtype=torch.bfloat16,
-                image_embeddings=image_encoder_last_hidden_state,
-                latent_indices=latent_indices,
-                clean_latents=clean_latents,
-                clean_latent_indices=clean_latent_indices,
-                clean_latents_2x=clean_latents_2x,
-                clean_latent_2x_indices=clean_latent_2x_indices,
-                clean_latents_4x=clean_latents_4x,
-                clean_latent_4x_indices=clean_latent_4x_indices,
-                callback=callback,
-            )
-            del clean_latents
-            del clean_latents_2x
-            del clean_latents_4x
-            del latent_indices
-            del clean_latent_indices
-            del clean_latent_2x_indices
-            del clean_latent_4x_indices
-            [total_generated_latent_frames, history_latents, history_pixels] = post_process(forward, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, section_index, total_latent_sections, outputs_folder, mp4_crf, stream)
-            if not forward:
-                if section_index > 0:
-                    section_index -= 1
                 else:
-                    clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
-                    clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
-                    real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :]
-                    zero_latents = history_latents[:, :, total_generated_latent_frames:, :, :]
-                    history_latents = torch.cat([zero_latents, real_history_latents], dim=2)
-                    del real_history_latents
-                    del zero_latents
-                    forward = True
-                    section_index = first_section_index
-            if forward:
-                section_index += 1
     except:
         traceback.print_exc()
@@ -648,7 +653,7 @@ def worker(input_image, end_image, image_position, end_stillness, prompts, n_pro
     return
 @torch.no_grad()
-def worker_start_end(input_image, end_image, image_position, end_stillness, prompts, n_prompt, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
     def encode_prompt(prompt, n_prompt):
         llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
@@ -668,8 +673,7 @@ def worker_start_end(input_image, end_image, image_position, end_stillness, prom
     total_latent_sections = (total_second_length * fps_number) / (latent_window_size * 4)
     total_latent_sections = int(max(round(total_latent_sections), 1))
-    job_id = generate_timestamp()
     stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
@@ -729,9 +733,11 @@ def worker_start_end(input_image, end_image, image_position, end_stillness, prom
                 load_model_as_complete(vae, target_device=gpu)
             start_latent = vae_encode(input_image_pt, vae)
             if has_end_image:
                 end_latent = vae_encode(end_image_pt, vae)
             # CLIP Vision
             stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
@@ -740,6 +746,7 @@ def worker_start_end(input_image, end_image, image_position, end_stillness, prom
                 load_model_as_complete(image_encoder, target_device=gpu)
             image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
             image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
             if has_end_image:
@@ -763,163 +770,171 @@ def worker_start_end(input_image, end_image, image_position, end_stillness, prom
         # Dtype
         image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
-        # Sampling
-        stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))
-        rnd = torch.Generator("cpu").manual_seed(seed)
-        num_frames = latent_window_size * 4 - 3
-        history_latents = torch.zeros(size=(1, 16, 1 + 2 + 16, height // 8, width // 8), dtype=torch.float32, device=cpu)
-        start_latent = start_latent.to(history_latents)
-        if has_end_image:
-            end_latent = end_latent.to(history_latents)
-        history_pixels = None
-        total_generated_latent_frames = 0
-        if total_latent_sections > 4:
-            # In theory the latent_paddings should follow the else sequence, but it seems that duplicating some
-            # items looks better than expanding it when total_latent_sections > 4
-            # One can try to remove below trick and just
-            # use `latent_paddings = list(reversed(range(total_latent_sections)))` to compare
-            latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0]
-        else:
-            # Convert an iterator to a list
-            latent_paddings = list(range(total_latent_sections - 1, -1, -1))
-        if enable_preview:
-            def callback(d):
-                preview = d['denoised']
-                preview = vae_decode_fake(preview)
-                preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
-                preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
-                if stream.input_queue.top() == 'end':
-                    stream.output_queue.push(('end', None))
-                    raise KeyboardInterrupt('User ends the task.')
-                current_step = d['i'] + 1
-                percentage = int(100.0 * current_step / steps)
-                hint = f'Sampling {current_step}/{steps}'
-                desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / fps_number) :.2f} seconds (FPS-30), Resolution: {height}px * {width}px. The video is being extended now ...'
-                stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
-                return
-        else:
-            def callback(d):
-                return
-        def post_process(job_id, start_latent, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, outputs_folder, mp4_crf, stream, is_last_section):
-            if is_last_section:
-                generated_latents = torch.cat([start_latent.to(generated_latents), generated_latents], dim=2)
-            total_generated_latent_frames += int(generated_latents.shape[2])
-            history_latents = torch.cat([generated_latents.to(history_latents), history_latents], dim=2)
-            if not high_vram:
-                offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
-                load_model_as_complete(vae, target_device=gpu)
-            if history_pixels is None:
-                history_pixels = vae_decode(history_latents[:, :, :total_generated_latent_frames, :, :], vae).cpu()
             else:
-                section_latent_frames = (latent_window_size * 2 + 1) if is_last_section else (latent_window_size * 2)
-                overlapped_frames = latent_window_size * 4 - 3
-                current_pixels = vae_decode(history_latents[:, :, :min(total_generated_latent_frames, section_latent_frames)], vae).cpu()
-                history_pixels = soft_append_bcthw(current_pixels, history_pixels, overlapped_frames)
-            if not high_vram:
-                unload_complete_models(vae)
-            if enable_preview or is_last_section:
-                output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
-                save_bcthw_as_mp4(history_pixels, output_filename, fps=fps_number, crf=mp4_crf)
-                print(f'Decoded. Pixel shape {history_pixels.shape}')
-                stream.output_queue.push(('file', output_filename))
-            return [total_generated_latent_frames, history_latents, history_pixels]
-        for latent_padding in latent_paddings:
-            is_last_section = latent_padding == 0
-            is_first_section = latent_padding == latent_paddings[0]
-            latent_padding_size = latent_padding * latent_window_size
-            if stream.input_queue.top() == 'end':
-                stream.output_queue.push(('end', None))
-                return
-            print(f'latent_padding_size = {latent_padding_size}, is_last_section = {is_last_section}, is_first_section = {is_first_section}')
-            if len(prompt_parameters) > 0:
-                [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n] = prompt_parameters.pop(len(prompt_parameters) - 1)
-            indices = torch.arange(1 + latent_padding_size + latent_window_size + 1 + (end_stillness if is_first_section else 0) + 2 + 16).unsqueeze(0)
-            clean_latent_indices_pre, blank_indices, latent_indices, clean_latent_indices_post, clean_latent_2x_indices, clean_latent_4x_indices = indices.split([1, latent_padding_size, latent_window_size, 1 + (end_stillness if is_first_section else 0), 2, 16], dim=1)
-            clean_latent_indices = torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1)
-            clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, :1 + 2 + 16, :, :].split([1, 2, 16], dim=2)
-            # Use end image latent for the first section if provided
-            if has_end_image and is_first_section:
-                clean_latents_post = end_latent.expand(-1, -1, 1 + end_stillness, -1, -1)
-            clean_latents = torch.cat([start_latent, clean_latents_post], dim=2)
-            if not high_vram:
-                unload_complete_models()
-                move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
-            if use_teacache:
-                transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
-            else:
-                transformer.initialize_teacache(enable_teacache=False)
-            generated_latents = sample_hunyuan(
-                transformer=transformer,
-                sampler='unipc',
-                width=width,
-                height=height,
-                frames=num_frames,
-                real_guidance_scale=cfg,
-                distilled_guidance_scale=gs,
-                guidance_rescale=rs,
-                # shift=3.0,
-                num_inference_steps=steps,
-                generator=rnd,
-                prompt_embeds=llama_vec,
-                prompt_embeds_mask=llama_attention_mask,
-                prompt_poolers=clip_l_pooler,
-                negative_prompt_embeds=llama_vec_n,
-                negative_prompt_embeds_mask=llama_attention_mask_n,
-                negative_prompt_poolers=clip_l_pooler_n,
-                device=gpu,
-                dtype=torch.bfloat16,
-                image_embeddings=image_encoder_last_hidden_state,
-                latent_indices=latent_indices,
-                clean_latents=clean_latents,
-                clean_latent_indices=clean_latent_indices,
-                clean_latents_2x=clean_latents_2x,
-                clean_latent_2x_indices=clean_latent_2x_indices,
-                clean_latents_4x=clean_latents_4x,
-                clean_latent_4x_indices=clean_latent_4x_indices,
-                callback=callback,
-            )
-            del clean_latents
-            del clean_latents_2x
-            del clean_latents_4x
-            del latent_indices
-            del clean_latent_indices
-            del clean_latent_2x_indices
-            del clean_latent_4x_indices
-            [total_generated_latent_frames, history_latents, history_pixels] = post_process(job_id, start_latent, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, outputs_folder, mp4_crf, stream, is_last_section)
-            if is_last_section:
-                break
     except:
         traceback.print_exc()
@@ -1116,7 +1131,6 @@ def worker_video(input_video, end_frame, end_stillness, prompts, n_prompt, seed,
             if batch > 1:
                 print(f"Beginning video {idx+1} of {batch} with seed {seed} ")
-            #job_id = generate_timestamp()
             job_id = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")+f"_framepackf1-videoinput_{width}-{total_second_length}sec_seed-{seed}_steps-{steps}_distilled-{gs}_cfg-{cfg}" # 20250506 pftq: easier to read timestamp and filename
             # Sampling
@@ -1132,7 +1146,7 @@ def worker_video(input_video, end_frame, end_stillness, prompts, n_prompt, seed,
             # 20250509 Generate backwards with end frame for better end frame anchoring
             if total_latent_sections > 4:
-                latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0]
             else:
                 latent_paddings = list(reversed(range(total_latent_sections)))
@@ -1253,30 +1267,33 @@ def worker_video(input_video, end_frame, end_stillness, prompts, n_prompt, seed,
     stream.output_queue.push(('end', None))
     return
-def get_duration(input_image, end_image, image_position, end_stillness, prompts, generation_mode, n_prompt, seed, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
     return allocation_time
 @spaces.GPU(duration=get_duration)
-def process_on_gpu(input_image, end_image, image_position, end_stillness, prompts, generation_mode, n_prompt, seed, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number
            ):
     start = time.time()
     global stream
     stream = AsyncStream()
-    async_run(worker_start_end if generation_mode == "start_end" else worker, input_image, end_image, image_position, end_stillness, prompts, n_prompt, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number)
     output_filename = None
     while True:
         flag, data = stream.output_queue.next()
         if flag == 'file':
             output_filename = data
-            yield gr.update(value=output_filename, label="Previewed Frames"), gr.skip(), gr.skip(), gr.skip(), gr.update(interactive=False), gr.update(interactive=True), gr.skip()
         if flag == 'progress':
             preview, desc, html = data
-            yield gr.update(label="Previewed Frames"), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True), gr.skip()
         if flag == 'end':
             end = time.time()
@@ -1285,7 +1302,7 @@ def process_on_gpu(input_image, end_image, image_position, end_stillness, prompt
             secondes = secondes - (minutes * 60)
             hours = math.floor(minutes / 60)
             minutes = minutes - (hours * 60)
-            yield gr.update(value=output_filename, label="Finished Frames"), gr.update(visible=False), gr.skip(), "The process has lasted " + \
             ((str(hours) + " h, ") if hours != 0 else "") + \
             ((str(minutes) + " min, ") if hours != 0 or minutes != 0 else "") + \
             str(secondes) + " sec. " + \
@@ -1303,6 +1320,7 @@ def process(input_image,
             seed=31337,
             auto_allocation=True,
             allocation_time=180,
             resolution=640,
             total_second_length=5,
             latent_window_size=9,
@@ -1321,7 +1339,7 @@ def process(input_image,
     if torch.cuda.device_count() == 0:
         gr.Warning('Set this space to GPU config to make it work.')
-        yield gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.update(visible = False)
         return
     if randomize_seed:
@@ -1336,7 +1354,7 @@ def process(input_image,
     assert input_image is not None, 'No input image!'
     assert (generation_mode != "start_end") or end_image is not None, 'No end image!'
-    yield gr.update(label="Previewed Frames"), None, '', '', gr.update(interactive=False), gr.update(interactive=True), gr.skip()
     gc.collect()
     yield from process_on_gpu(input_image,
@@ -1347,6 +1365,7 @@ def process(input_image,
             generation_mode,
             n_prompt,
             seed,
             resolution,
             total_second_length,
             allocation_time,
@@ -1375,17 +1394,20 @@ def process_video_on_gpu(input_video, end_frame, end_stillness, prompts, n_promp
     async_run(worker_video, input_video, end_frame, end_stillness, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch)
     output_filename = None
     while True:
         flag, data = stream.output_queue.next()
         if flag == 'file':
             output_filename = data
-            yield gr.update(value=output_filename, label="Previewed Frames"), gr.skip(), gr.skip(), gr.skip(), gr.update(interactive=False), gr.update(interactive=True), gr.skip()
         if flag == 'progress':
             preview, desc, html = data
-            yield gr.update(label="Previewed Frames"), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True), gr.skip() # 20250506 pftq: Keep refreshing the video in case it got hidden when the tab was in the background
         if flag == 'end':
             end = time.time()
@@ -1394,7 +1416,7 @@ def process_video_on_gpu(input_video, end_frame, end_stillness, prompts, n_promp
             secondes = secondes - (minutes * 60)
             hours = math.floor(minutes / 60)
             minutes = minutes - (hours * 60)
-            yield gr.update(value=output_filename, label="Finished Frames"), gr.update(visible=False), desc + \
             " The process has lasted " + \
             ((str(hours) + " h, ") if hours != 0 else "") + \
             ((str(minutes) + " min, ") if hours != 0 or minutes != 0 else "") + \
@@ -1409,7 +1431,7 @@ def process_video(input_video, end_frame, end_stillness, prompt, n_prompt, rando
     if torch.cuda.device_count() == 0:
         gr.Warning('Set this space to GPU config to make it work.')
-        yield gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.update(visible = False)
         return
     if randomize_seed:
@@ -1420,7 +1442,7 @@ def process_video(input_video, end_frame, end_stillness, prompt, n_prompt, rando
     # 20250506 pftq: Updated assertion for video input
     assert input_video is not None, 'No input video!'
-    yield gr.update(label="Previewed Frames"), None, '', '', gr.update(interactive=False), gr.update(interactive=True), gr.skip()
     # 20250507 pftq: Even the H100 needs offloading if the video dimensions are 720p or higher
     if high_vram and (no_resize or resolution>640):
@@ -1535,7 +1557,7 @@ with block:
                 enable_preview = gr.Checkbox(label='Enable preview', value=True, info='Display a preview around each second generated but it costs 2 sec. for each second generated.')
                 use_teacache = gr.Checkbox(label='Use TeaCache', value=False, info='Faster speed and no break in brightness, but often makes hands and fingers slightly worse.')
-                n_prompt = gr.Textbox(label="Negative Prompt", value="Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", info='Requires using normal CFG (undistilled) instead of Distilled (set Distilled=1 and CFG > 1).')
                 fps_number = gr.Slider(label="Frame per seconds", info="The model is trained for 30 fps so other fps may generate weird results", minimum=10, maximum=60, value=30, step=1)
                 end_stillness = gr.Slider(label="End stillness", minimum=0, maximum=100, value=0, step=1, info='0=Realistic end; >0=Matches exactly the end image (but the time seems to freeze)')
@@ -1548,7 +1570,7 @@ with block:
                     resolution = gr.Dropdown([
                         ["409,600 px (working)", 640],
                         ["451,584 px (working)", 672],
-                        ["495,616 px (VRAM pb on HF)", 704],
                         ["589,824 px (not tested)", 768],
                         ["692,224 px (not tested)", 832],
                         ["746,496 px (not tested)", 864],
@@ -1576,7 +1598,7 @@ with block:
                 gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. Larger value causes slower speed.")
                 mp4_crf = gr.Slider(label="MP4 Compression", minimum=0, maximum=100, value=16, step=1, info="Lower means better quality. 0 is uncompressed. Change to 16 if you get black outputs. ")
-                batch = gr.Slider(label="Batch Size (Number of Videos)", minimum=1, maximum=1000, value=1, step=1, info='Generate multiple videos each with a different seed.')
                 with gr.Row():
                     randomize_seed = gr.Checkbox(label='Randomize seed', value=True, info='If checked, the seed is always different')
                     seed = gr.Slider(label="Seed", minimum=0, maximum=np.iinfo(np.int32).max, step=1, randomize=True)
@@ -1586,12 +1608,21 @@ with block:
         with gr.Column():
             warning = gr.HTML(elem_id="warning", value = "<center><big>Your computer must <u>not</u> enter into standby mode.</big><br/>On Chrome, you can force to keep a tab alive in <code>chrome://discards/</code></center>", visible = False)
-            result_video = gr.Video(label="Generated Frames", autoplay=True, show_share_button=False, height=512, loop=True)
-            preview_image = gr.Image(label="Next Latents", height=200, visible=False)
             progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
             progress_bar = gr.HTML('', elem_classes='no-generating-animation')
-    ips = [input_image, end_image, image_position, end_stillness, final_prompt, generation_mode, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number]
     ips_video = [input_video, end_image, end_stillness, final_prompt, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch]
     gr.Examples(
@@ -1604,11 +1635,12 @@ with block:
                     1, # end_stillness
                     "Overcrowed street in Japan, photorealistic, realistic, intricate details, 8k, insanely detailed",
                     "text", # generation_mode
-                    "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
                     True, # randomize_seed
                     42, # seed
                     True, # auto_allocation
                     180, # allocation_time
                     672, # resolution
                     1, # total_second_length
                     9, # latent_window_size
@@ -1626,7 +1658,7 @@ with block:
         run_on_click = True,
         fn = process,
 	    inputs = ips,
-        outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
         cache_examples = False,
     )
@@ -1640,11 +1672,12 @@ with block:
                     1, # end_stillness
                     "A dolphin emerges from the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
                     "image", # generation_mode
-                    "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
                     True, # randomize_seed
                     42, # seed
                     True, # auto_allocation
                     180, # allocation_time
                     672, # resolution
                     1, # total_second_length
                     9, # latent_window_size
@@ -1665,11 +1698,12 @@ with block:
                     1, # end_stillness
                     "A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks, the man stops talking and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens",
                     "image", # generation_mode
-                    "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
                     True, # randomize_seed
                     42, # seed
                     True, # auto_allocation
                     180, # allocation_time
                     672, # resolution
                     2, # total_second_length
                     9, # latent_window_size
@@ -1690,11 +1724,12 @@ with block:
                     1, # end_stillness
                     "A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks, the woman stops talking and the woman listens A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens",
                     "image", # generation_mode
-                    "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
                     True, # randomize_seed
                     42, # seed
                     True, # auto_allocation
                     180, # allocation_time
                     672, # resolution
                     2, # total_second_length
                     9, # latent_window_size
@@ -1720,6 +1755,7 @@ with block:
                     42, # seed
                     True, # auto_allocation
                     180, # allocation_time
                     672, # resolution
                     1, # total_second_length
                     9, # latent_window_size
@@ -1740,11 +1776,12 @@ with block:
                     1, # end_stillness
                     "A building starting to explode, photorealistic, realisitc, 8k, insanely detailed",
                     "image", # generation_mode
-                    "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
                     True, # randomize_seed
                     42, # seed
                     True, # auto_allocation
                     180, # allocation_time
                     672, # resolution
                     1, # total_second_length
                     9, # latent_window_size
@@ -1762,7 +1799,7 @@ with block:
         run_on_click = True,
         fn = process,
 	    inputs = ips,
-        outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
         cache_examples = False,
     )
@@ -1776,11 +1813,12 @@ with block:
                     0, # end_stillness
                     "A woman jumps out of the train and arrives on the ground, viewed from the outside, photorealistic, realistic, amateur photography, midday, insanely detailed, 8k", # prompt
                     "start_end", # generation_mode
-                    "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, jumpcut, crossfader, crossfading", # n_prompt
                     True, # randomize_seed
                     42, # seed
                     True, # auto_allocation
                     180, # allocation_time
                     672, # resolution
                     1, # total_second_length
                     9, # latent_window_size
@@ -1798,7 +1836,7 @@ with block:
         run_on_click = True,
         fn = process,
 	    inputs = ips,
-        outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
         cache_examples = False,
     )
@@ -1810,7 +1848,7 @@ with block:
                     None, # end_image
                     1, # end_stillness
                     "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
-                    "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, jumpcut, crossfader, crossfading", # n_prompt
                     True, # randomize_seed
                     42, # seed
                     True, # auto_allocation
@@ -1836,7 +1874,7 @@ with block:
                     "./img_examples/Example1.png", # end_image
                     1, # end_stillness
                     "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
-                    "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, jumpcut, crossfader, crossfading", # n_prompt
                     True, # randomize_seed
                     42, # seed
                     True, # auto_allocation
@@ -1861,7 +1899,7 @@ with block:
         run_on_click = True,
         fn = process_video,
 	    inputs = ips_video,
-	    outputs = [result_video, preview_image, progress_desc, progress_bar, start_button_video, end_button, warning],
         cache_examples = False,
     )
@@ -1899,7 +1937,6 @@ with block:
             gr.update(visible = True),  # start_button
             gr.update(visible = False), # start_button_video
             gr.update(visible = False), # no_resize
-            gr.update(visible = False), # batch
             gr.update(visible = False), # num_clean_frames
             gr.update(visible = False), # vae_batch
             gr.update(visible = False), # prompt_hint
@@ -1916,7 +1953,6 @@ with block:
             gr.update(visible = True),  # start_button
             gr.update(visible = False), # start_button_video
             gr.update(visible = False), # no_resize
-            gr.update(visible = False), # batch
             gr.update(visible = False), # num_clean_frames
             gr.update(visible = False), # vae_batch
             gr.update(visible = False), # prompt_hint
@@ -1933,7 +1969,6 @@ with block:
             gr.update(visible = True),  # start_button
             gr.update(visible = False), # start_button_video
             gr.update(visible = False), # no_resize
-            gr.update(visible = False), # batch
             gr.update(visible = False), # num_clean_frames
             gr.update(visible = False), # vae_batch
             gr.update(visible = False), # prompt_hint
@@ -1950,7 +1985,6 @@ with block:
             gr.update(visible = False), # start_button
             gr.update(visible = True),  # start_button_video
             gr.update(visible = True),  # no_resize
-            gr.update(visible = True),  # batch
             gr.update(visible = True),  # num_clean_frames
             gr.update(visible = True),  # vae_batch
             gr.update(visible = True),  # prompt_hint
@@ -1961,10 +1995,10 @@ with block:
     timeless_prompt.change(fn=handle_timeless_prompt_change, inputs=[timeless_prompt], outputs=[final_prompt])
     start_button.click(fn = check_parameters, inputs = [
         generation_mode, input_image, input_video
-    ], outputs = [end_button, warning], queue = False, show_progress = False).success(fn=process, inputs=ips, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning], scroll_to_output = True)
     start_button_video.click(fn = check_parameters, inputs = [
         generation_mode, input_image, input_video
-    ], outputs = [end_button, warning], queue = False, show_progress = False).success(fn=process_video, inputs=ips_video, outputs=[result_video, preview_image, progress_desc, progress_bar, start_button_video, end_button, warning], scroll_to_output = True)
     end_button.click(fn=end_process)
     generation_mode.change(fn = save_preferences, inputs = [
@@ -1977,7 +2011,7 @@ with block:
     generation_mode.change(
         fn=handle_generation_mode_change,
         inputs=[generation_mode],
-        outputs=[text_to_video_hint, image_position, input_image, end_image, end_stillness, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch, prompt_hint, fps_number]
     )
     # Update display when the page loads
@@ -1985,7 +2019,7 @@ with block:
         fn=handle_generation_mode_change, inputs = [
         generation_mode
     ], outputs = [
-       text_to_video_hint, image_position, input_image, end_image, end_stillness, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch, prompt_hint, fps_number
     ]
     )

 from diffusers import AutoencoderKLHunyuanVideo
 from transformers import LlamaModel, CLIPTextModel, LlamaTokenizerFast, CLIPTokenizer
 from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode, vae_decode_fake
+from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, state_dict_weighted_merge, state_dict_offset_merge
 from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
 from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
 if torch.cuda.device_count() > 0:
         raise
 @torch.no_grad()
+def worker(input_image, end_image, image_position, end_stillness, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
     def encode_prompt(prompt, n_prompt):
         llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
     section_index = first_section_index
     forward = (image_position == 0)
     stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
     try:
         image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
+        for idx in range(batch):
+            if batch > 1:
+                print(f"Beginning video {idx+1} of {batch} with seed {seed} ")
+            job_id = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")+f"_framepackf1_{width}-{total_second_length}sec_seed-{seed}_steps-{steps}_distilled-{gs}_cfg-{cfg}"
+            # Sampling
+            stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))
+            rnd = torch.Generator("cpu").manual_seed(seed)
+            history_latents = torch.zeros(size=(1, 16, 16 + 2 + 1, height // 8, width // 8), dtype=torch.float32, device=cpu)
+            start_latent = start_latent.to(history_latents)
+            history_pixels = None
+            history_latents = torch.cat([history_latents, start_latent] if forward else [start_latent, history_latents], dim=2)
+            total_generated_latent_frames = 1
+            if enable_preview:
+                def callback(d):
+                    preview = d['denoised']
+                    preview = vae_decode_fake(preview)
+                    preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
+                    preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
+                    if stream.input_queue.top() == 'end':
+                        stream.output_queue.push(('end', None))
+                        raise KeyboardInterrupt('User ends the task.')
+                    current_step = d['i'] + 1
+                    percentage = int(100.0 * current_step / steps)
+                    hint = f'Sampling {current_step}/{steps}'
+                    desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / fps_number) :.2f} seconds (FPS-30), Resolution: {height}px * {width}px, Video {idx+1} of {batch}. The video is being extended now ...'
+                    stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
+                    return
             else:
+                def callback(d):  # preview disabled: same signature as the preview hook, but `d` is ignored and nothing is pushed
+                    return
+            indices = torch.arange(0, 1 + 16 + 2 + 1 + latent_window_size).unsqueeze(0)
             if forward:
+                clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
+                clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
             else:
+                latent_indices, clean_latent_1x_indices, clean_latent_2x_indices, clean_latent_4x_indices, clean_latent_indices_start = indices.split([latent_window_size, 1, 2, 16, 1], dim=1)
+                clean_latent_indices = torch.cat([clean_latent_1x_indices, clean_latent_indices_start], dim=1)
+            def post_process(forward, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, section_index, total_latent_sections, outputs_folder, mp4_crf, stream):  # Merge one sampled section into the latent history, decode it to pixels and optionally save an MP4; returns [frame count, latents, pixels] for the caller to rebind
+                total_generated_latent_frames += int(generated_latents.shape[2])  # size of dim 2 (the axis concatenated on the next line) is counted as new latent frames
+                history_latents = torch.cat([history_latents, generated_latents.to(history_latents)] if forward else [generated_latents.to(history_latents), history_latents], dim=2)  # forward: append after the history; otherwise prepend before it
+                if not high_vram:  # low-VRAM path
+                    offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
+                    load_model_as_complete(vae, target_device=gpu)  # presumably puts the VAE on the GPU for the decode below - TODO confirm helper semantics
+                if history_pixels is None:  # first decode of this video: decode every latent frame generated so far
+                    real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :] if forward else history_latents[:, :, :total_generated_latent_frames, :, :]  # generated frames sit at the tail when forward, at the head otherwise
+                    history_pixels = vae_decode(real_history_latents, vae).cpu()
                 else:
+                    section_latent_frames = latent_window_size * 2  # decode at most this many latent frames next to the side being extended
+                    overlapped_frames = latent_window_size * 4 - 3  # same expression the enclosing code passes as `frames` to sample_hunyuan
+                    if forward:
+                        real_history_latents = history_latents[:, :, -min(section_latent_frames, total_generated_latent_frames):, :, :]
+                        history_pixels = soft_append_bcthw(history_pixels, vae_decode(real_history_latents, vae).cpu(), overlapped_frames)  # existing pixels first, new pixels second; presumably blended over overlapped_frames - TODO confirm
+                    else:
+                        real_history_latents = history_latents[:, :, :min(section_latent_frames, total_generated_latent_frames), :, :]
+                        history_pixels = soft_append_bcthw(vae_decode(real_history_latents, vae).cpu(), history_pixels, overlapped_frames)  # new pixels first, existing pixels second
+                if not high_vram:
+                    unload_complete_models(text_encoder, text_encoder_2, image_encoder, vae, transformer)  # these model names are not parameters; they are read from the enclosing scope
+                if enable_preview or section_index == (0 if first_section_index == (total_latent_sections - 1) else (total_latent_sections - 1)):  # save after every section when previewing; otherwise only at index 0 (if the run started on the last index) or at the last index - presumably the section processed last, TODO confirm against the enclosing loop
+                    output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')  # job_id is read from the enclosing scope
+                    save_bcthw_as_mp4(history_pixels, output_filename, fps=fps_number, crf=mp4_crf)  # fps_number is read from the enclosing scope
+                    print(f'Decoded. Current latent shape pixel shape {history_pixels.shape}')
+                    stream.output_queue.push(('file', output_filename))
+                return [total_generated_latent_frames, history_latents, history_pixels]  # arguments were only rebound locally, so the updated values must be returned
+            while section_index < total_latent_sections:
+                if stream.input_queue.top() == 'end':
+                    stream.output_queue.push(('end', None))
+                    return
+                print(f'section_index = {section_index}, total_latent_sections = {total_latent_sections}')
+                prompt_index = min(section_index, len(prompt_parameters) - 1)
+                [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n] = prompt_parameters[prompt_index]
+                if prompt_index < len(prompt_parameters) - 1 or (prompt_index == total_latent_sections - 1):
+                    del prompt_parameters[prompt_index]
+                if not high_vram:
+                    unload_complete_models()
+                    move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
+                if use_teacache:
+                    transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
+                else:
+                    transformer.initialize_teacache(enable_teacache=False)
+                if forward:
+                    clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -(16 + 2 + 1):, :, :].split([16, 2, 1], dim=2)
+                    clean_latents = torch.cat([start_latent, clean_latents_1x], dim=2)
+                else:
+                    clean_latents_1x, clean_latents_2x, clean_latents_4x = history_latents[:, :, :(1 + 2 + 16), :, :].split([1, 2, 16], dim=2)
+                    clean_latents = torch.cat([clean_latents_1x, start_latent], dim=2)
+                generated_latents = sample_hunyuan(
+                    transformer=transformer,
+                    sampler='unipc',
+                    width=width,
+                    height=height,
+                    frames=latent_window_size * 4 - 3,
+                    real_guidance_scale=cfg,
+                    distilled_guidance_scale=gs,
+                    guidance_rescale=rs,
+                    # shift=3.0,
+                    num_inference_steps=steps,
+                    generator=rnd,
+                    prompt_embeds=llama_vec,
+                    prompt_embeds_mask=llama_attention_mask,
+                    prompt_poolers=clip_l_pooler,
+                    negative_prompt_embeds=llama_vec_n,
+                    negative_prompt_embeds_mask=llama_attention_mask_n,
+                    negative_prompt_poolers=clip_l_pooler_n,
+                    device=gpu,
+                    dtype=torch.bfloat16,
+                    image_embeddings=image_encoder_last_hidden_state,
+                    latent_indices=latent_indices,
+                    clean_latents=clean_latents,
+                    clean_latent_indices=clean_latent_indices,
+                    clean_latents_2x=clean_latents_2x,
+                    clean_latent_2x_indices=clean_latent_2x_indices,
+                    clean_latents_4x=clean_latents_4x,
+                    clean_latent_4x_indices=clean_latent_4x_indices,
+                    callback=callback,
+                )
+                del clean_latents
+                del clean_latents_2x
+                del clean_latents_4x
+                del latent_indices
+                del clean_latent_indices
+                del clean_latent_2x_indices
+                del clean_latent_4x_indices
+                [total_generated_latent_frames, history_latents, history_pixels] = post_process(forward, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, section_index, total_latent_sections, outputs_folder, mp4_crf, stream)
+                if not forward:
+                    if section_index > 0:
+                        section_index -= 1
+                    else:
+                        clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
+                        clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
+                        real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :]
+                        zero_latents = history_latents[:, :, total_generated_latent_frames:, :, :]
+                        history_latents = torch.cat([zero_latents, real_history_latents], dim=2)
+                        del real_history_latents
+                        del zero_latents
+                        forward = True
+                        section_index = first_section_index
+                if forward:
+                    section_index += 1
+            seed = (seed + 1) % np.iinfo(np.int32).max
     except:
         traceback.print_exc()
     return
 @torch.no_grad()
+def worker_start_end(input_image, end_image, image_position, end_stillness, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
     def encode_prompt(prompt, n_prompt):
         llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
     total_latent_sections = (total_second_length * fps_number) / (latent_window_size * 4)
     total_latent_sections = int(max(round(total_latent_sections), 1))
     stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
                 load_model_as_complete(vae, target_device=gpu)
             start_latent = vae_encode(input_image_pt, vae)
+            del input_image_pt
             if has_end_image:
                 end_latent = vae_encode(end_image_pt, vae)
+                del end_image_pt
             # CLIP Vision
             stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
                 load_model_as_complete(image_encoder, target_device=gpu)
             image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
+            del input_image_np
             image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
             if has_end_image:
         # Dtype
         image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
+        for idx in range(batch):
+            if batch > 1:
+                print(f"Beginning video {idx+1} of {batch} with seed {seed} ")
+            job_id = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")+f"_framepackse_{width}-{total_second_length}sec_seed-{seed}_steps-{steps}_distilled-{gs}_cfg-{cfg}"
+            # Sampling
+            stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))
+            rnd = torch.Generator("cpu").manual_seed(seed)
+            num_frames = latent_window_size * 4 - 3
+            history_latents = torch.zeros(size=(1, 16, 1 + 2 + 16, height // 8, width // 8), dtype=torch.float32, device=cpu)
+            start_latent = start_latent.to(history_latents)
+            if has_end_image:
+                end_latent = end_latent.to(history_latents)
+            history_pixels = None
+            total_generated_latent_frames = 0
+            if total_latent_sections > 4:
+                # In theory the latent_paddings should follow the else sequence, but it seems that duplicating some
+                # items looks better than expanding it when total_latent_sections > 4
+                # One can try to remove below trick and just
+                # use `latent_paddings = list(reversed(range(total_latent_sections)))` to compare
+                latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0]
             else:
+                # Convert an iterator to a list
+                latent_paddings = list(range(total_latent_sections - 1, -1, -1))
+            if enable_preview:
+                def callback(d):
+                    preview = d['denoised']
+                    preview = vae_decode_fake(preview)
+                    preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
+                    preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
+                    if stream.input_queue.top() == 'end':
+                        stream.output_queue.push(('end', None))
+                        raise KeyboardInterrupt('User ends the task.')
+                    current_step = d['i'] + 1
+                    percentage = int(100.0 * current_step / steps)
+                    hint = f'Sampling {current_step}/{steps}'
+                    desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / fps_number) :.2f} seconds (FPS-30), Resolution: {height}px * {width}px, Video {idx+1} of {batch}. The video is being extended now ...'
+                    stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
+                    return
+            else:
+                def callback(d):  # preview disabled: same signature as the preview hook, but `d` is ignored and nothing is pushed
+                    return
+            def post_process(job_id, start_latent, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, outputs_folder, mp4_crf, stream, is_last_section):  # Prepend one sampled section to the latent history, decode it to pixels and optionally save an MP4; returns [frame count, latents, pixels] for the caller to rebind
+                if is_last_section:
+                    generated_latents = torch.cat([start_latent.to(generated_latents), generated_latents], dim=2)  # on the last section the start latent is put in front of the generated latents
+                total_generated_latent_frames += int(generated_latents.shape[2])  # size of dim 2 (the axis concatenated on the next line) is counted as new latent frames
+                history_latents = torch.cat([generated_latents.to(history_latents), history_latents], dim=2)  # new latents always go in front of the existing history
+                if not high_vram:  # low-VRAM path
+                    offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
+                    load_model_as_complete(vae, target_device=gpu)  # presumably puts the VAE on the GPU for the decode below - TODO confirm helper semantics
+                if history_pixels is None:  # first decode of this video: decode every latent frame generated so far
+                    history_pixels = vae_decode(history_latents[:, :, :total_generated_latent_frames, :, :], vae).cpu()
+                else:
+                    section_latent_frames = (latent_window_size * 2 + 1) if is_last_section else (latent_window_size * 2)  # one extra latent frame on the last section, matching the start latent prepended above
+                    overlapped_frames = latent_window_size * 4 - 3  # same expression as num_frames in the enclosing scope
+                    current_pixels = vae_decode(history_latents[:, :, :min(total_generated_latent_frames, section_latent_frames)], vae).cpu()  # decode only the leading latent frames
+                    history_pixels = soft_append_bcthw(current_pixels, history_pixels, overlapped_frames)  # new pixels first, existing pixels second; presumably blended over overlapped_frames - TODO confirm
+                if not high_vram:
+                    unload_complete_models(vae)
+                if enable_preview or is_last_section:  # save after every section when previewing, otherwise only after the last one
+                    output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
+                    save_bcthw_as_mp4(history_pixels, output_filename, fps=fps_number, crf=mp4_crf)  # fps_number is not a parameter; it is read from the enclosing scope
+                    print(f'Decoded. Pixel shape {history_pixels.shape}')
+                    stream.output_queue.push(('file', output_filename))
+                return [total_generated_latent_frames, history_latents, history_pixels]  # arguments were only rebound locally, so the updated values must be returned
+            for latent_padding in latent_paddings:
+                is_last_section = latent_padding == 0
+                is_first_section = latent_padding == latent_paddings[0]
+                latent_padding_size = latent_padding * latent_window_size
+                if stream.input_queue.top() == 'end':
+                    stream.output_queue.push(('end', None))
+                    return
+                print(f'latent_padding_size = {latent_padding_size}, is_last_section = {is_last_section}, is_first_section = {is_first_section}')
+                if len(prompt_parameters) > 0:
+                    [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n] = prompt_parameters.pop(len(prompt_parameters) - 1)
+                indices = torch.arange(1 + latent_padding_size + latent_window_size + 1 + (end_stillness if is_first_section else 0) + 2 + 16).unsqueeze(0)
+                clean_latent_indices_pre, blank_indices, latent_indices, clean_latent_indices_post, clean_latent_2x_indices, clean_latent_4x_indices = indices.split([1, latent_padding_size, latent_window_size, 1 + (end_stillness if is_first_section else 0), 2, 16], dim=1)
+                clean_latent_indices = torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1)
+                clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, :1 + 2 + 16, :, :].split([1, 2, 16], dim=2)
+                # Use end image latent for the first section if provided
+                if has_end_image and is_first_section:
+                    clean_latents_post = end_latent.expand(-1, -1, 1 + end_stillness, -1, -1)
+                clean_latents = torch.cat([start_latent, clean_latents_post], dim=2)
+                if not high_vram:
+                    unload_complete_models()
+                    move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
+                if use_teacache:
+                    transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
+                else:
+                    transformer.initialize_teacache(enable_teacache=False)
+                generated_latents = sample_hunyuan(
+                    transformer=transformer,
+                    sampler='unipc',
+                    width=width,
+                    height=height,
+                    frames=num_frames,
+                    real_guidance_scale=cfg,
+                    distilled_guidance_scale=gs,
+                    guidance_rescale=rs,
+                    # shift=3.0,
+                    num_inference_steps=steps,
+                    generator=rnd,
+                    prompt_embeds=llama_vec,
+                    prompt_embeds_mask=llama_attention_mask,
+                    prompt_poolers=clip_l_pooler,
+                    negative_prompt_embeds=llama_vec_n,
+                    negative_prompt_embeds_mask=llama_attention_mask_n,
+                    negative_prompt_poolers=clip_l_pooler_n,
+                    device=gpu,
+                    dtype=torch.bfloat16,
+                    image_embeddings=image_encoder_last_hidden_state,
+                    latent_indices=latent_indices,
+                    clean_latents=clean_latents,
+                    clean_latent_indices=clean_latent_indices,
+                    clean_latents_2x=clean_latents_2x,
+                    clean_latent_2x_indices=clean_latent_2x_indices,
+                    clean_latents_4x=clean_latents_4x,
+                    clean_latent_4x_indices=clean_latent_4x_indices,
+                    callback=callback,
+                )
+                del clean_latents
+                del clean_latents_2x
+                del clean_latents_4x
+                del latent_indices
+                del clean_latent_indices
+                del clean_latent_2x_indices
+                del clean_latent_4x_indices
+                [total_generated_latent_frames, history_latents, history_pixels] = post_process(job_id, start_latent, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, outputs_folder, mp4_crf, stream, is_last_section)
+                if is_last_section:
+                    break
+            seed = (seed + 1) % np.iinfo(np.int32).max
     except:
         traceback.print_exc()
             if batch > 1:
                 print(f"Beginning video {idx+1} of {batch} with seed {seed} ")
             job_id = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")+f"_framepackf1-videoinput_{width}-{total_second_length}sec_seed-{seed}_steps-{steps}_distilled-{gs}_cfg-{cfg}" # 20250506 pftq: easier to read timestamp and filename
             # Sampling
             # 20250509 Generate backwards with end frame for better end frame anchoring
             if total_latent_sections > 4:
+                latent_paddings = [3, 2] + [1] * (total_latent_sections - 3) + [0]
             else:
                 latent_paddings = list(reversed(range(total_latent_sections)))
     stream.output_queue.push(('end', None))
     return
+def get_duration(input_image, end_image, image_position, end_stillness, prompts, generation_mode, n_prompt, seed, batch, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):  # Duration callback passed to @spaces.GPU(duration=...) just below; the parameter list mirrors process_on_gpu's so both accept the same arguments.
     return allocation_time  # Only allocation_time is read; every other argument is ignored. NOTE(review): presumably a GPU time budget in seconds -- confirm against the spaces.GPU documentation.
 @spaces.GPU(duration=get_duration)
+def process_on_gpu(input_image, end_image, image_position, end_stillness, prompts, generation_mode, n_prompt, seed, batch, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number
            ):
     start = time.time()
     global stream
     stream = AsyncStream()
+    async_run(worker_start_end if generation_mode == "start_end" else worker, input_image, end_image, image_position, end_stillness, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number)
     output_filename = None
+    output_filenames = ""
     while True:
         flag, data = stream.output_queue.next()
         if flag == 'file':
             output_filename = data
+            output_filenames = output_filenames + ";" + str(output_filename)
+            print("output_filename=" + str(output_filename))
+            yield gr.update(value=output_filename, label="Previewed Frames"), gr.update(value=output_filenames, visible=True), gr.skip(), gr.skip(), gr.skip(), gr.update(interactive=False), gr.update(interactive=True), gr.skip()
         if flag == 'progress':
             preview, desc, html = data
+            yield gr.update(label="Previewed Frames"), gr.skip(), gr.skip(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True), gr.skip()
         if flag == 'end':
             end = time.time()
             secondes = secondes - (minutes * 60)
             hours = math.floor(minutes / 60)
             minutes = minutes - (hours * 60)
+            yield gr.update(value=output_filename, label="Finished Frames"), gr.update(value=output_filenames, visible=True), gr.update(visible=False), gr.skip(), "The process has lasted " + \
             ((str(hours) + " h, ") if hours != 0 else "") + \
             ((str(minutes) + " min, ") if hours != 0 or minutes != 0 else "") + \
             str(secondes) + " sec. " + \
             seed=31337,
             auto_allocation=True,
             allocation_time=180,
+            batch=1,
             resolution=640,
             total_second_length=5,
             latent_window_size=9,
     if torch.cuda.device_count() == 0:
         gr.Warning('Set this space to GPU config to make it work.')
+        yield gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.update(visible = False)
         return
     if randomize_seed:
     assert input_image is not None, 'No input image!'
     assert (generation_mode != "start_end") or end_image is not None, 'No end image!'
+    yield gr.update(label="Previewed Frames"), gr.update(value = ""), None, '', '', gr.update(interactive=False), gr.update(interactive=True), gr.skip()
     gc.collect()
     yield from process_on_gpu(input_image,
             generation_mode,
             n_prompt,
             seed,
+            batch,
             resolution,
             total_second_length,
             allocation_time,
     async_run(worker_video, input_video, end_frame, end_stillness, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch)
     output_filename = None
+    output_filenames = ""
     while True:
         flag, data = stream.output_queue.next()
         if flag == 'file':
             output_filename = data
+            print("output_filename=" + str(output_filename))
+            output_filenames = output_filenames + ";" + str(output_filename)
+            yield gr.update(value=output_filename, label="Previewed Frames"), gr.update(value=output_filenames, visible=True), gr.skip(), gr.skip(), gr.skip(), gr.update(interactive=False), gr.update(interactive=True), gr.skip()
         if flag == 'progress':
             preview, desc, html = data
+            yield gr.update(label="Previewed Frames"), gr.skip(), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True), gr.skip() # 20250506 pftq: Keep refreshing the video in case it got hidden when the tab was in the background
         if flag == 'end':
             end = time.time()
             secondes = secondes - (minutes * 60)
             hours = math.floor(minutes / 60)
             minutes = minutes - (hours * 60)
+            yield gr.update(value=output_filename, label="Finished Frames"), gr.update(value=output_filenames, visible=True), gr.update(visible=False), desc + \
             " The process has lasted " + \
             ((str(hours) + " h, ") if hours != 0 else "") + \
             ((str(minutes) + " min, ") if hours != 0 or minutes != 0 else "") + \
     if torch.cuda.device_count() == 0:
         gr.Warning('Set this space to GPU config to make it work.')
+        yield gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.update(visible = False)
         return
     if randomize_seed:
     # 20250506 pftq: Updated assertion for video input
     assert input_video is not None, 'No input video!'
+    yield gr.update(label="Previewed Frames"), gr.update(value = ""), None, '', '', gr.update(interactive=False), gr.update(interactive=True), gr.skip()
     # 20250507 pftq: Even the H100 needs offloading if the video dimensions are 720p or higher
     if high_vram and (no_resize or resolution>640):
                 enable_preview = gr.Checkbox(label='Enable preview', value=True, info='Display a preview around each second generated but it costs 2 sec. for each second generated.')
                 use_teacache = gr.Checkbox(label='Use TeaCache', value=False, info='Faster speed and no break in brightness, but often makes hands and fingers slightly worse.')
+                n_prompt = gr.Textbox(label="Negative Prompt", value="Missing arm, long hand, sagging skin, unjustified body movement, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, abrupt change in camera trajectory, jumpcut, crossfader, crossfading", info='Requires using normal CFG (undistilled) instead of Distilled (set Distilled=1 and CFG > 1).')
                 fps_number = gr.Slider(label="Frame per seconds", info="The model is trained for 30 fps so other fps may generate weird results", minimum=10, maximum=60, value=30, step=1)
                 end_stillness = gr.Slider(label="End stillness", minimum=0, maximum=100, value=0, step=1, info='0=Realistic end; >0=Matches exactly the end image (but the time seems to freeze)')
                     resolution = gr.Dropdown([
                         ["409,600 px (working)", 640],
                         ["451,584 px (working)", 672],
+                        ["495,616 px (working for extension)", 704],
                         ["589,824 px (not tested)", 768],
                         ["692,224 px (not tested)", 832],
                         ["746,496 px (not tested)", 864],
                 gpu_memory_preservation = gr.Slider(label="GPU Inference Preserved Memory (GB) (larger means slower)", minimum=6, maximum=128, value=6, step=0.1, info="Set this number to a larger value if you encounter OOM. Larger value causes slower speed.")
                 mp4_crf = gr.Slider(label="MP4 Compression", minimum=0, maximum=100, value=16, step=1, info="Lower means better quality. 0 is uncompressed. Change to 16 if you get black outputs. ")
+                batch = gr.Slider(label="Batch Size (number of videos)", minimum=1, maximum=1000, value=1, step=1, info='Generate multiple videos each with a different seed')
                 with gr.Row():
                     randomize_seed = gr.Checkbox(label='Randomize seed', value=True, info='If checked, the seed is always different')
                     seed = gr.Slider(label="Seed", minimum=0, maximum=np.iinfo(np.int32).max, step=1, randomize=True)
         with gr.Column():
             warning = gr.HTML(elem_id="warning", value = "<center><big>Your computer must <u>not</u> enter into standby mode.</big><br/>On Chrome, you can force to keep a tab alive in <code>chrome://discards/</code></center>", visible = False)
+            result_video = gr.Video(label="Generated Frames", autoplay = True, show_share_button = False, height = 512, loop = True)
+            download_textbox = gr.HTML(label="Download list", visible = False)
+            @gr.render(inputs=download_textbox)
+            def show_split(download_textbox):
+                # Render one "Download" button per file listed in download_textbox.
+                # The generators in this file build that value by appending
+                # ";" + path for each produced file, so the string starts with a
+                # separator and the first split element is always empty.
+                # Truthiness (rather than len()) also covers a None value, which
+                # would otherwise raise TypeError before any file has been produced.
+                if not download_textbox:
+                    return
+                for one_path in download_textbox.split(";")[1:]:
+                    # Components created inside a gr.render function are attached
+                    # to the layout on creation, so no reference needs to be kept.
+                    gr.DownloadButton(label="Download", value=one_path)
+            preview_image = gr.Image(label="Next Latents", height = 200, visible = False)
             progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
             progress_bar = gr.HTML('', elem_classes='no-generating-animation')
+    ips = [input_image, end_image, image_position, end_stillness, final_prompt, generation_mode, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number]
     ips_video = [input_video, end_image, end_stillness, final_prompt, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch]
     gr.Examples(
                     1, # end_stillness
                     "Overcrowed street in Japan, photorealistic, realistic, intricate details, 8k, insanely detailed",
                     "text", # generation_mode
+                    "Missing arm, long hand, sagging skin, unjustified body movement, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, abrupt change in camera trajectory, jumpcut, crossfader, crossfading", # n_prompt
                     True, # randomize_seed
                     42, # seed
                     True, # auto_allocation
                     180, # allocation_time
+                    1, # batch
                     672, # resolution
                     1, # total_second_length
                     9, # latent_window_size
         run_on_click = True,
         fn = process,
 	    inputs = ips,
+        outputs = [result_video, download_textbox, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
         cache_examples = False,
     )
                     1, # end_stillness
                     "A dolphin emerges from the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
                     "image", # generation_mode
+                    "Missing arm, long hand, sagging skin, unjustified body movement, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, abrupt change in camera trajectory, jumpcut, crossfader, crossfading", # n_prompt
                     True, # randomize_seed
                     42, # seed
                     True, # auto_allocation
                     180, # allocation_time
+                    1, # batch
                     672, # resolution
                     1, # total_second_length
                     9, # latent_window_size
                     1, # end_stillness
                     "A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks, the man stops talking and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens",
                     "image", # generation_mode
+                    "Missing arm, long hand, sagging skin, unjustified body movement, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, abrupt change in camera trajectory, jumpcut, crossfader, crossfading", # n_prompt
                     True, # randomize_seed
                     42, # seed
                     True, # auto_allocation
                     180, # allocation_time
+                    1, # batch
                     672, # resolution
                     2, # total_second_length
                     9, # latent_window_size
                     1, # end_stillness
                     "A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks, the woman stops talking and the woman listens A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens",
                     "image", # generation_mode
+                    "Missing arm, long hand, sagging skin, unjustified body movement, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, abrupt change in camera trajectory, jumpcut, crossfader, crossfading", # n_prompt
                     True, # randomize_seed
                     42, # seed
                     True, # auto_allocation
                     180, # allocation_time
+                    1, # batch
                     672, # resolution
                     2, # total_second_length
                     9, # latent_window_size
                     42, # seed
                     True, # auto_allocation
                     180, # allocation_time
+                    1, # batch
                     672, # resolution
                     1, # total_second_length
                     9, # latent_window_size
                     1, # end_stillness
                     "A building starting to explode, photorealistic, realisitc, 8k, insanely detailed",
                     "image", # generation_mode
+                    "Missing arm, long hand, sagging skin, unjustified body movement, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, abrupt change in camera trajectory, jumpcut, crossfader, crossfading", # n_prompt
                     True, # randomize_seed
                     42, # seed
                     True, # auto_allocation
                     180, # allocation_time
+                    1, # batch
                     672, # resolution
                     1, # total_second_length
                     9, # latent_window_size
         run_on_click = True,
         fn = process,
 	    inputs = ips,
+        outputs = [result_video, download_textbox, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
         cache_examples = False,
     )
                     0, # end_stillness
                     "A woman jumps out of the train and arrives on the ground, viewed from the outside, photorealistic, realistic, amateur photography, midday, insanely detailed, 8k", # prompt
                     "start_end", # generation_mode
+                    "Missing arm, long hand, sagging skin, unjustified body movement, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, abrupt change in camera trajectory, jumpcut, crossfader, crossfading", # n_prompt
                     True, # randomize_seed
                     42, # seed
                     True, # auto_allocation
                     180, # allocation_time
+                    1, # batch
                     672, # resolution
                     1, # total_second_length
                     9, # latent_window_size
         run_on_click = True,
         fn = process,
 	    inputs = ips,
+        outputs = [result_video, download_textbox, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
         cache_examples = False,
     )
                     None, # end_image
                     1, # end_stillness
                     "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
+                    "Missing arm, long hand, sagging skin, unjustified body movement, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, abrupt change in camera trajectory, jumpcut, crossfader, crossfading", # n_prompt
                     True, # randomize_seed
                     42, # seed
                     True, # auto_allocation
                     "./img_examples/Example1.png", # end_image
                     1, # end_stillness
                     "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
+                    "Missing arm, long hand, sagging skin, unjustified body movement, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, abrupt change in camera trajectory, jumpcut, crossfader, crossfading", # n_prompt
                     True, # randomize_seed
                     42, # seed
                     True, # auto_allocation
         run_on_click = True,
         fn = process_video,
 	    inputs = ips_video,
+	    outputs = [result_video, download_textbox, preview_image, progress_desc, progress_bar, start_button_video, end_button, warning],
         cache_examples = False,
     )
             gr.update(visible = True),  # start_button
             gr.update(visible = False), # start_button_video
             gr.update(visible = False), # no_resize
             gr.update(visible = False), # num_clean_frames
             gr.update(visible = False), # vae_batch
             gr.update(visible = False), # prompt_hint
             gr.update(visible = True),  # start_button
             gr.update(visible = False), # start_button_video
             gr.update(visible = False), # no_resize
             gr.update(visible = False), # num_clean_frames
             gr.update(visible = False), # vae_batch
             gr.update(visible = False), # prompt_hint
             gr.update(visible = True),  # start_button
             gr.update(visible = False), # start_button_video
             gr.update(visible = False), # no_resize
             gr.update(visible = False), # num_clean_frames
             gr.update(visible = False), # vae_batch
             gr.update(visible = False), # prompt_hint
             gr.update(visible = False), # start_button
             gr.update(visible = True),  # start_button_video
             gr.update(visible = True),  # no_resize
             gr.update(visible = True),  # num_clean_frames
             gr.update(visible = True),  # vae_batch
             gr.update(visible = True),  # prompt_hint
     timeless_prompt.change(fn=handle_timeless_prompt_change, inputs=[timeless_prompt], outputs=[final_prompt])
     start_button.click(fn = check_parameters, inputs = [
         generation_mode, input_image, input_video
+    ], outputs = [end_button, warning], queue = False, show_progress = False).success(fn=process, inputs=ips, outputs=[result_video, download_textbox, preview_image, progress_desc, progress_bar, start_button, end_button, warning], scroll_to_output = True)
     start_button_video.click(fn = check_parameters, inputs = [
         generation_mode, input_image, input_video
+    ], outputs = [end_button, warning], queue = False, show_progress = False).success(fn=process_video, inputs=ips_video, outputs=[result_video, download_textbox, preview_image, progress_desc, progress_bar, start_button_video, end_button, warning], scroll_to_output = True)
     end_button.click(fn=end_process)
     generation_mode.change(fn = save_preferences, inputs = [
     generation_mode.change(
         fn=handle_generation_mode_change,
         inputs=[generation_mode],
+        outputs=[text_to_video_hint, image_position, input_image, end_image, end_stillness, input_video, start_button, start_button_video, no_resize, num_clean_frames, vae_batch, prompt_hint, fps_number]
     )
     # Update display when the page loads
         fn=handle_generation_mode_change, inputs = [
         generation_mode
     ], outputs = [
+       text_to_video_hint, image_position, input_image, end_image, end_stillness, input_video, start_button, start_button_video, no_resize, num_clean_frames, vae_batch, prompt_hint, fps_number
     ]
     )