Fabrice-TIERCELIN committed on
Commit
e6fa5f4
·
verified ·
1 Parent(s): 5990b3a

Image at any position

Browse files
Files changed (1) hide show
  1. app.py +185 -130
app.py CHANGED
@@ -57,8 +57,8 @@ if torch.cuda.device_count() > 0:
57
  free_mem_gb = get_cuda_free_memory_gb(gpu)
58
  high_vram = free_mem_gb > 60
59
 
60
- print(f'Free VRAM {free_mem_gb} GB')
61
- print(f'High-VRAM Mode: {high_vram}')
62
 
63
  text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder', torch_dtype=torch.float16).cpu()
64
  text_encoder_2 = CLIPTextModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder_2', torch_dtype=torch.float16).cpu()
@@ -82,7 +82,7 @@ if torch.cuda.device_count() > 0:
82
  vae.enable_tiling()
83
 
84
  transformer.high_quality_fp32_output_for_inference = True
85
- print('transformer.high_quality_fp32_output_for_inference = True')
86
 
87
  transformer.to(dtype=torch.bfloat16)
88
  vae.to(dtype=torch.float16)
@@ -136,36 +136,36 @@ def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, devi
136
  """
137
  # 20250506 pftq: Normalize video path for Windows compatibility
138
  video_path = str(pathlib.Path(video_path).resolve())
139
- print(f"Processing video: {video_path}")
140
 
141
  # 20250506 pftq: Check CUDA availability and fallback to CPU if needed
142
  if device == "cuda" and not torch.cuda.is_available():
143
- print("CUDA is not available, falling back to CPU")
144
  device = "cpu"
145
 
146
  try:
147
  # 20250506 pftq: Load video and get FPS
148
- print("Initializing VideoReader...")
149
  vr = decord.VideoReader(video_path)
150
  fps = vr.get_avg_fps() # Get input video FPS
151
  num_real_frames = len(vr)
152
- print(f"Video loaded: {num_real_frames} frames, FPS: {fps}")
153
 
154
  # Truncate to nearest latent size (multiple of 4)
155
  latent_size_factor = 4
156
  num_frames = (num_real_frames // latent_size_factor) * latent_size_factor
157
- if num_frames != num_real_frames:
158
- print(f"Truncating video from {num_real_frames} to {num_frames} frames for latent size compatibility")
159
  num_real_frames = num_frames
160
 
161
  # 20250506 pftq: Read frames
162
- print("Reading video frames...")
163
  frames = vr.get_batch(range(num_real_frames)).asnumpy() # Shape: (num_real_frames, height, width, channels)
164
- print(f"Frames read: {frames.shape}")
165
 
166
  # 20250506 pftq: Get native video resolution
167
  native_height, native_width = frames.shape[1], frames.shape[2]
168
- print(f"Native video resolution: {native_width}x{native_height}")
169
 
170
  # 20250506 pftq: Use native resolution if height/width not specified, otherwise use provided values
171
  target_height = native_height if height is None else height
@@ -174,9 +174,9 @@ def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, devi
174
  # 20250506 pftq: Adjust to nearest bucket for model compatibility
175
  if not no_resize:
176
  target_height, target_width = find_nearest_bucket(target_height, target_width, resolution=resolution)
177
- print(f"Adjusted resolution: {target_width}x{target_height}")
178
- else:
179
- print(f"Using native resolution without resizing: {target_width}x{target_height}")
180
 
181
  # 20250506 pftq: Preprocess frames to match original image processing
182
  processed_frames = []
@@ -185,34 +185,34 @@ def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, devi
185
  frame_np = resize_and_center_crop(frame, target_width=target_width, target_height=target_height)
186
  processed_frames.append(frame_np)
187
  processed_frames = np.stack(processed_frames) # Shape: (num_real_frames, height, width, channels)
188
- print(f"Frames preprocessed: {processed_frames.shape}")
189
 
190
  # 20250506 pftq: Save first frame for CLIP vision encoding
191
  input_image_np = processed_frames[0]
192
 
193
  # 20250506 pftq: Convert to tensor and normalize to [-1, 1]
194
- print("Converting frames to tensor...")
195
  frames_pt = torch.from_numpy(processed_frames).float() / 127.5 - 1
196
  frames_pt = frames_pt.permute(0, 3, 1, 2) # Shape: (num_real_frames, channels, height, width)
197
  frames_pt = frames_pt.unsqueeze(0) # Shape: (1, num_real_frames, channels, height, width)
198
  frames_pt = frames_pt.permute(0, 2, 1, 3, 4) # Shape: (1, channels, num_real_frames, height, width)
199
- print(f"Tensor shape: {frames_pt.shape}")
200
 
201
  # 20250507 pftq: Save pixel frames for use in worker
202
  input_video_pixels = frames_pt.cpu()
203
 
204
  # 20250506 pftq: Move to device
205
- print(f"Moving tensor to device: {device}")
206
  frames_pt = frames_pt.to(device)
207
- print("Tensor moved to device")
208
 
209
  # 20250506 pftq: Move VAE to device
210
- print(f"Moving VAE to device: {device}")
211
  vae.to(device)
212
- print("VAE moved to device")
213
 
214
  # 20250506 pftq: Encode frames in batches
215
- print(f"Encoding input video frames in VAE batch size {vae_batch_size} (reduce if memory issues here or if forcing video resolution)")
216
  latents = []
217
  vae.eval()
218
  with torch.no_grad():
@@ -238,19 +238,19 @@ def video_encode(video_path, resolution, no_resize, vae, vae_batch_size=16, devi
238
  raise
239
 
240
  # 20250506 pftq: Concatenate latents
241
- print("Concatenating latents...")
242
  history_latents = torch.cat(latents, dim=2) # Shape: (1, channels, frames, height//8, width//8)
243
- print(f"History latents shape: {history_latents.shape}")
244
 
245
  # 20250506 pftq: Get first frame's latent
246
  start_latent = history_latents[:, :, :1] # Shape: (1, channels, 1, height//8, width//8)
247
- print(f"Start latent shape: {start_latent.shape}")
248
 
249
  # 20250506 pftq: Move VAE back to CPU to free GPU memory
250
  if device == "cuda":
251
  vae.to(cpu)
252
  torch.cuda.empty_cache()
253
- print("VAE moved back to CPU, CUDA cache cleared")
254
 
255
  return start_latent, input_image_np, history_latents, fps, target_height, target_width, input_video_pixels
256
 
@@ -266,7 +266,7 @@ def set_mp4_comments_imageio_ffmpeg(input_file, comments):
266
 
267
  # Check if input file exists
268
  if not os.path.exists(input_file):
269
- print(f"Error: Input file {input_file} does not exist")
270
  return False
271
 
272
  # Create a temporary file path
@@ -289,13 +289,13 @@ def set_mp4_comments_imageio_ffmpeg(input_file, comments):
289
  if result.returncode == 0:
290
  # Replace the original file with the modified one
291
  shutil.move(temp_file, input_file)
292
- print(f"Successfully added comments to {input_file}")
293
  return True
294
  else:
295
  # Clean up temp file if FFmpeg fails
296
  if os.path.exists(temp_file):
297
  os.remove(temp_file)
298
- print(f"Error: FFmpeg failed with message:\n{result.stderr}")
299
  return False
300
 
301
  except Exception as e:
@@ -306,8 +306,7 @@ def set_mp4_comments_imageio_ffmpeg(input_file, comments):
306
  return False
307
 
308
  @torch.no_grad()
309
- def worker(input_image, image_position, prompts, n_prompt, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf):
310
- is_last_frame = (image_position == 100)
311
  def encode_prompt(prompt, n_prompt):
312
  llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
313
 
@@ -325,9 +324,13 @@ def worker(input_image, image_position, prompts, n_prompt, seed, resolution, tot
325
  clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
326
  return [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n]
327
 
328
- total_latent_sections = (total_second_length * 30) / (latent_window_size * 4)
329
  total_latent_sections = int(max(round(total_latent_sections), 1))
330
 
 
 
 
 
331
  job_id = generate_timestamp()
332
 
333
  stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
@@ -349,9 +352,15 @@ def worker(input_image, image_position, prompts, n_prompt, seed, resolution, tot
349
 
350
  prompt_parameters = []
351
 
352
- for prompt_part in prompts:
353
  prompt_parameters.append(encode_prompt(prompt_part, n_prompt))
354
 
 
 
 
 
 
 
355
  # Processing input image
356
 
357
  stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Image processing ...'))))
@@ -403,7 +412,7 @@ def worker(input_image, image_position, prompts, n_prompt, seed, resolution, tot
403
  start_latent = start_latent.to(history_latents)
404
  history_pixels = None
405
 
406
- history_latents = torch.cat([start_latent, history_latents] if is_last_frame else [history_latents, start_latent], dim=2)
407
  total_generated_latent_frames = 1
408
 
409
  if enable_preview:
@@ -421,65 +430,69 @@ def worker(input_image, image_position, prompts, n_prompt, seed, resolution, tot
421
  current_step = d['i'] + 1
422
  percentage = int(100.0 * current_step / steps)
423
  hint = f'Sampling {current_step}/{steps}'
424
- desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / 30) :.2f} seconds (FPS-30), Resolution: {height}px * {width}px. The video is being extended now ...'
425
  stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
426
  return
427
  else:
428
  def callback(d):
429
  return
430
 
431
- indices = torch.arange(0, sum([1, 16, 2, 1, latent_window_size])).unsqueeze(0)
432
- if is_last_frame:
433
- latent_indices, clean_latent_1x_indices, clean_latent_2x_indices, clean_latent_4x_indices, clean_latent_indices_start = indices.split([latent_window_size, 1, 2, 16, 1], dim=1)
434
- clean_latent_indices = torch.cat([clean_latent_1x_indices, clean_latent_indices_start], dim=1)
435
- else:
436
  clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
437
  clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
 
 
 
438
 
439
- def post_process(generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, section_index, total_latent_sections, outputs_folder, mp4_crf, stream):
440
  total_generated_latent_frames += int(generated_latents.shape[2])
441
- history_latents = torch.cat([generated_latents.to(history_latents), history_latents], dim=2) if is_last_frame else torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)
442
 
443
  if not high_vram:
444
  offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
445
  load_model_as_complete(vae, target_device=gpu)
446
 
447
  if history_pixels is None:
448
- real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :] if is_last_frame else history_latents[:, :, -total_generated_latent_frames:, :, :]
449
  history_pixels = vae_decode(real_history_latents, vae).cpu()
450
  else:
451
  section_latent_frames = latent_window_size * 2
452
  overlapped_frames = latent_window_size * 4 - 3
453
 
454
- if is_last_frame:
455
- real_history_latents = history_latents[:, :, :min(section_latent_frames, total_generated_latent_frames), :, :]
456
- history_pixels = soft_append_bcthw(vae_decode(real_history_latents, vae).cpu(), history_pixels, overlapped_frames)
457
- else:
458
  real_history_latents = history_latents[:, :, -min(section_latent_frames, total_generated_latent_frames):, :, :]
459
  history_pixels = soft_append_bcthw(history_pixels, vae_decode(real_history_latents, vae).cpu(), overlapped_frames)
 
 
 
460
 
461
  if not high_vram:
462
  unload_complete_models()
463
 
464
- if enable_preview or section_index == (0 if is_last_frame else (total_latent_sections - 1)):
465
  output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
466
 
467
- save_bcthw_as_mp4(history_pixels, output_filename, fps=30, crf=mp4_crf)
468
 
469
  print(f'Decoded. Current latent shape pixel shape {history_pixels.shape}')
470
 
471
  stream.output_queue.push(('file', output_filename))
472
  return [total_generated_latent_frames, history_latents, history_pixels]
473
-
474
- for section_index in range(total_latent_sections - 1, -1, -1) if is_last_frame else range(total_latent_sections):
475
  if stream.input_queue.top() == 'end':
476
  stream.output_queue.push(('end', None))
477
  return
478
 
479
  print(f'section_index = {section_index}, total_latent_sections = {total_latent_sections}')
480
 
481
- if len(prompt_parameters) > 0:
482
- [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n] = prompt_parameters.pop((len(prompt_parameters) - 1) if is_last_frame else 0)
 
 
 
 
483
 
484
  if not high_vram:
485
  unload_complete_models()
@@ -490,12 +503,12 @@ def worker(input_image, image_position, prompts, n_prompt, seed, resolution, tot
490
  else:
491
  transformer.initialize_teacache(enable_teacache=False)
492
 
493
- if is_last_frame:
494
- clean_latents_1x, clean_latents_2x, clean_latents_4x = history_latents[:, :, :sum([1, 2, 16]), :, :].split([1, 2, 16], dim=2)
495
- clean_latents = torch.cat([clean_latents_1x, start_latent], dim=2)
496
- else:
497
- clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -sum([16, 2, 1]):, :, :].split([16, 2, 1], dim=2)
498
  clean_latents = torch.cat([start_latent, clean_latents_1x], dim=2)
 
 
 
499
 
500
  generated_latents = sample_hunyuan(
501
  transformer=transformer,
@@ -528,7 +541,25 @@ def worker(input_image, image_position, prompts, n_prompt, seed, resolution, tot
528
  callback=callback,
529
  )
530
 
531
- [total_generated_latent_frames, history_latents, history_pixels] = post_process(generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, section_index, total_latent_sections, outputs_folder, mp4_crf, stream)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
532
  except:
533
  traceback.print_exc()
534
 
@@ -563,6 +594,17 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
563
  stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
564
 
565
  try:
 
 
 
 
 
 
 
 
 
 
 
566
  # Clean GPU
567
  if not high_vram:
568
  unload_complete_models(
@@ -578,16 +620,14 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
578
 
579
  prompt_parameters = []
580
 
581
- for prompt_part in prompts:
582
  prompt_parameters.append(encode_prompt(prompt_part, n_prompt))
583
 
584
- # 20250506 pftq: Processing input video instead of image
585
- stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Video processing ...'))))
586
-
587
- # 20250506 pftq: Encode video
588
- start_latent, input_image_np, video_latents, fps, height, width = video_encode(input_video, resolution, no_resize, vae, vae_batch_size=vae_batch, device=gpu)[:6]
589
- start_latent = start_latent.to(dtype=torch.float32).cpu()
590
- video_latents = video_latents.cpu()
591
 
592
  # CLIP Vision
593
  stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
@@ -601,9 +641,6 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
601
  # Dtype
602
  image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
603
 
604
- total_latent_sections = (total_second_length * fps) / (latent_window_size * 4)
605
- total_latent_sections = int(max(round(total_latent_sections), 1))
606
-
607
  if enable_preview:
608
  def callback(d):
609
  preview = d['denoised']
@@ -640,7 +677,7 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
640
  total_context_frames = num_4x_frames + num_2x_frames + effective_clean_frames
641
  total_context_frames = min(total_context_frames, available_frames) # 20250507 pftq: Edge case for <=1 sec videos
642
 
643
- indices = torch.arange(0, sum([1, num_4x_frames, num_2x_frames, effective_clean_frames, adjusted_latent_frames])).unsqueeze(0) # 20250507 pftq: latent_window_size to adjusted_latent_frames for edge case for <=1 sec videos
644
  clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split(
645
  [1, num_4x_frames, num_2x_frames, effective_clean_frames, adjusted_latent_frames], dim=1 # 20250507 pftq: latent_window_size to adjusted_latent_frames for edge case for <=1 sec videos
646
  )
@@ -809,34 +846,18 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
809
  stream.output_queue.push(('end', None))
810
  return
811
 
812
- def get_duration(input_image, image_position, prompts, generation_mode, n_prompt, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf):
813
- return total_second_length * 60 * (0.9 if use_teacache else 1.5) * (1 + ((steps - 25) / 100))
814
 
815
  # Remove this decorator if you run on local
816
  @spaces.GPU(duration=get_duration)
817
- def process_on_gpu(input_image,
818
- image_position=0,
819
- prompts=[""],
820
- generation_mode="image",
821
- n_prompt="",
822
- seed=31337,
823
- resolution=640,
824
- total_second_length=5,
825
- latent_window_size=9,
826
- steps=25,
827
- cfg=1.0,
828
- gs=10.0,
829
- rs=0.0,
830
- gpu_memory_preservation=6,
831
- enable_preview=True,
832
- use_teacache=False,
833
- mp4_crf=16
834
  ):
835
  start = time.time()
836
  global stream
837
  stream = AsyncStream()
838
 
839
- async_run(worker, input_image, image_position, prompts, n_prompt, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf)
840
 
841
  output_filename = None
842
 
@@ -845,11 +866,11 @@ def process_on_gpu(input_image,
845
 
846
  if flag == 'file':
847
  output_filename = data
848
- yield gr.update(value=output_filename, label="Previewed Frames"), gr.update(), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True), gr.update()
849
 
850
  if flag == 'progress':
851
  preview, desc, html = data
852
- yield gr.update(label="Previewed Frames"), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True), gr.update()
853
 
854
  if flag == 'end':
855
  end = time.time()
@@ -858,7 +879,7 @@ def process_on_gpu(input_image,
858
  secondes = secondes - (minutes * 60)
859
  hours = math.floor(minutes / 60)
860
  minutes = minutes - (hours * 60)
861
- yield gr.update(value=output_filename, label="Finished Frames"), gr.update(visible=False), gr.update(), "The process has lasted " + \
862
  ((str(hours) + " h, ") if hours != 0 else "") + \
863
  ((str(minutes) + " min, ") if hours != 0 or minutes != 0 else "") + \
864
  str(secondes) + " sec. " + \
@@ -872,6 +893,8 @@ def process(input_image,
872
  n_prompt="",
873
  randomize_seed=True,
874
  seed=31337,
 
 
875
  resolution=640,
876
  total_second_length=5,
877
  latent_window_size=9,
@@ -882,12 +905,15 @@ def process(input_image,
882
  gpu_memory_preservation=6,
883
  enable_preview=True,
884
  use_teacache=False,
885
- mp4_crf=16
 
886
  ):
 
 
887
 
888
  if torch.cuda.device_count() == 0:
889
  gr.Warning('Set this space to GPU config to make it work.')
890
- yield gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update(visible = False)
891
  return
892
 
893
  if randomize_seed:
@@ -901,7 +927,7 @@ def process(input_image,
901
  input_image = np.ones((default_height, default_width, 3), dtype=np.uint8) * 255
902
  print("No input image provided. Using a blank white image.")
903
 
904
- yield gr.update(label="Previewed Frames"), None, '', '', gr.update(interactive=False), gr.update(interactive=True), gr.update()
905
 
906
  yield from process_on_gpu(input_image,
907
  image_position,
@@ -911,6 +937,7 @@ def process(input_image,
911
  seed,
912
  resolution,
913
  total_second_length,
 
914
  latent_window_size,
915
  steps,
916
  cfg,
@@ -919,15 +946,16 @@ def process(input_image,
919
  gpu_memory_preservation,
920
  enable_preview,
921
  use_teacache,
922
- mp4_crf
 
923
  )
924
 
925
- def get_duration_video(input_video, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
926
- return total_second_length * 60 * (1.5 if use_teacache else 2.5) * (1 + ((steps - 25) / 100))
927
 
928
  # Remove this decorator if you run on local
929
  @spaces.GPU(duration=get_duration_video)
930
- def process_video_on_gpu(input_video, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
931
  start = time.time()
932
  global stream
933
  stream = AsyncStream()
@@ -942,11 +970,11 @@ def process_video_on_gpu(input_video, prompts, n_prompt, seed, batch, resolution
942
 
943
  if flag == 'file':
944
  output_filename = data
945
- yield gr.update(value=output_filename, label="Previewed Frames"), gr.update(), gr.update(), gr.update(), gr.update(interactive=False), gr.update(interactive=True), gr.update()
946
 
947
  if flag == 'progress':
948
  preview, desc, html = data
949
- yield gr.update(label="Previewed Frames"), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True), gr.update() # 20250506 pftq: Keep refreshing the video in case it got hidden when the tab was in the background
950
 
951
  if flag == 'end':
952
  end = time.time()
@@ -963,12 +991,14 @@ def process_video_on_gpu(input_video, prompts, n_prompt, seed, batch, resolution
963
  " You can upscale the result with RIFE. To make all your generated scenes consistent, you can then apply a face swap on the main character. If you do not see the generated video above, the process may have failed. See the logs for more information. If you see an error like ''NVML_SUCCESS == r INTERNAL ASSERT FAILED'', you probably haven't enough VRAM. Test an example or other options to compare. You can share your inputs to the original space or set your space in public for a peer review.", '', gr.update(interactive=True), gr.update(interactive=False), gr.update(visible = False)
964
  break
965
 
966
- def process_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
967
  global high_vram
 
 
968
 
969
  if torch.cuda.device_count() == 0:
970
  gr.Warning('Set this space to GPU config to make it work.')
971
- yield gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update(), gr.update(visible = False)
972
  return
973
 
974
  if randomize_seed:
@@ -979,7 +1009,7 @@ def process_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, re
979
  # 20250506 pftq: Updated assertion for video input
980
  assert input_video is not None, 'No input video!'
981
 
982
- yield gr.update(label="Previewed Frames"), None, '', '', gr.update(interactive=False), gr.update(interactive=True), gr.update()
983
 
984
  # 20250507 pftq: Even the H100 needs offloading if the video dimensions are 720p or higher
985
  if high_vram and (no_resize or resolution>640):
@@ -994,7 +1024,7 @@ def process_video(input_video, prompt, n_prompt, randomize_seed, seed, batch, re
994
  if cfg > 1:
995
  gs = 1
996
 
997
- yield from process_video_on_gpu(input_video, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch)
998
 
999
  def end_process():
1000
  stream.input_queue.push('end')
@@ -1065,9 +1095,9 @@ with block:
1065
  with gr.Row():
1066
  with gr.Column():
1067
  generation_mode = gr.Radio([["Text-to-Video", "text"], ["Image-to-Video", "image"], ["Video Extension", "video"]], elem_id="generation-mode", label="Generation mode", value = "image")
1068
- text_to_video_hint = gr.HTML("I discourage to use the Text-to-Video feature. You should rather generate an image with Flux and use Image-to-Video. You will save time.")
1069
  input_image = gr.Image(sources='upload', type="numpy", label="Image", height=320)
1070
- image_position = gr.Slider(label="Image position", minimum=0, maximum=100, value=0, step=100, info='0=Video start; 100=Video end (lower quality)')
1071
  input_video = gr.Video(sources='upload', label="Input Video", height=320)
1072
  timeless_prompt = gr.Textbox(label="Timeless prompt", info='Used on the whole duration of the generation', value='', placeholder="The creature starts to move, fast motion, fixed camera, focus motion, consistent arm, consistent position, mute colors, insanely detailed")
1073
  prompt_number = gr.Slider(label="Timed prompt number", minimum=0, maximum=1000, value=0, step=1, info='Prompts will automatically appear')
@@ -1081,7 +1111,7 @@ with block:
1081
 
1082
  final_prompt = gr.Textbox(label="Final prompt", value='', info='Use ; to separate in time; beware to write to stop the previous action')
1083
  prompt_hint = gr.HTML("Video extension barely follows the prompt; to force to follow the prompt, you have to set the Distilled CFG Scale to 3.0 and the Context Frames to 2 but the video quality will be poor.")
1084
- total_second_length = gr.Slider(label="Video Length to Generate (seconds)", minimum=1, maximum=120, value=2, step=0.1)
1085
 
1086
  with gr.Row():
1087
  start_button = gr.Button(value="🎥 Generate", variant="primary")
@@ -1094,6 +1124,8 @@ with block:
1094
 
1095
  n_prompt = gr.Textbox(label="Negative Prompt", value="Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", info='Requires using normal CFG (undistilled) instead of Distilled (set Distilled=1 and CFG > 1).')
1096
 
 
 
1097
  latent_window_size = gr.Slider(label="Latent Window Size", minimum=1, maximum=33, value=9, step=1, info='Generate more frames at a time (larger chunks). Less degradation and better blending but higher VRAM cost. Should not change.')
1098
  steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=30, step=1, info='Increase for more quality, especially if using high non-distilled CFG. If your animation has very few motion, you may have brutal brightness change; this can be fixed increasing the steps.')
1099
 
@@ -1134,6 +1166,9 @@ with block:
1134
  with gr.Row():
1135
  randomize_seed = gr.Checkbox(label='Randomize seed', value=True, info='If checked, the seed is always different')
1136
  seed = gr.Slider(label="Seed", minimum=0, maximum=np.iinfo(np.int32).max, step=1, randomize=True)
 
 
 
1137
 
1138
  with gr.Column():
1139
  warning = gr.HTML(value = "<center><big>Your computer must <u>not</u> enter into standby mode.</big><br/>On Chrome, you can force to keep a tab alive in <code>chrome://discards/</code></center>", visible = False)
@@ -1143,11 +1178,11 @@ with block:
1143
  progress_bar = gr.HTML('', elem_classes='no-generating-animation')
1144
 
1145
  # 20250506 pftq: Updated inputs to include num_clean_frames
1146
- ips = [input_image, image_position, final_prompt, generation_mode, n_prompt, randomize_seed, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf]
1147
- ips_video = [input_video, final_prompt, n_prompt, randomize_seed, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch]
1148
 
1149
  gr.Examples(
1150
- label = "Examples from text",
1151
  examples = [
1152
  [
1153
  None, # input_image
@@ -1157,6 +1192,8 @@ with block:
1157
  "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1158
  True, # randomize_seed
1159
  42, # seed
 
 
1160
  672, # resolution
1161
  1, # total_second_length
1162
  9, # latent_window_size
@@ -1167,18 +1204,19 @@ with block:
1167
  6, # gpu_memory_preservation
1168
  False, # enable_preview
1169
  False, # use_teacache
1170
- 16 # mp4_crf
 
1171
  ]
1172
  ],
1173
  run_on_click = True,
1174
  fn = process,
1175
  inputs = ips,
1176
- outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button],
1177
  cache_examples = False,
1178
  )
1179
 
1180
  gr.Examples(
1181
- label = "Examples from image",
1182
  examples = [
1183
  [
1184
  "./img_examples/Example1.png", # input_image
@@ -1188,6 +1226,8 @@ with block:
1188
  "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1189
  True, # randomize_seed
1190
  42, # seed
 
 
1191
  672, # resolution
1192
  1, # total_second_length
1193
  9, # latent_window_size
@@ -1198,7 +1238,8 @@ with block:
1198
  6, # gpu_memory_preservation
1199
  False, # enable_preview
1200
  True, # use_teacache
1201
- 16 # mp4_crf
 
1202
  ],
1203
  [
1204
  "./img_examples/Example2.webp", # input_image
@@ -1208,6 +1249,8 @@ with block:
1208
  "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1209
  True, # randomize_seed
1210
  42, # seed
 
 
1211
  672, # resolution
1212
  2, # total_second_length
1213
  9, # latent_window_size
@@ -1218,7 +1261,8 @@ with block:
1218
  6, # gpu_memory_preservation
1219
  False, # enable_preview
1220
  True, # use_teacache
1221
- 16 # mp4_crf
 
1222
  ],
1223
  [
1224
  "./img_examples/Example2.webp", # input_image
@@ -1228,6 +1272,8 @@ with block:
1228
  "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1229
  True, # randomize_seed
1230
  42, # seed
 
 
1231
  672, # resolution
1232
  2, # total_second_length
1233
  9, # latent_window_size
@@ -1238,7 +1284,8 @@ with block:
1238
  6, # gpu_memory_preservation
1239
  False, # enable_preview
1240
  True, # use_teacache
1241
- 16 # mp4_crf
 
1242
  ],
1243
  [
1244
  "./img_examples/Example3.jpg", # input_image
@@ -1248,6 +1295,8 @@ with block:
1248
  "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1249
  True, # randomize_seed
1250
  42, # seed
 
 
1251
  672, # resolution
1252
  1, # total_second_length
1253
  9, # latent_window_size
@@ -1258,7 +1307,8 @@ with block:
1258
  6, # gpu_memory_preservation
1259
  False, # enable_preview
1260
  True, # use_teacache
1261
- 16 # mp4_crf
 
1262
  ],
1263
  [
1264
  "./img_examples/Example4.webp", # input_image
@@ -1268,6 +1318,8 @@ with block:
1268
  "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1269
  True, # randomize_seed
1270
  42, # seed
 
 
1271
  672, # resolution
1272
  1, # total_second_length
1273
  9, # latent_window_size
@@ -1278,18 +1330,19 @@ with block:
1278
  6, # gpu_memory_preservation
1279
  False, # enable_preview
1280
  False, # use_teacache
1281
- 16 # mp4_crf
 
1282
  ]
1283
  ],
1284
  run_on_click = True,
1285
  fn = process,
1286
  inputs = ips,
1287
- outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button],
1288
  cache_examples = False,
1289
  )
1290
 
1291
  gr.Examples(
1292
- label = "Examples from video",
1293
  examples = [
1294
  [
1295
  "./img_examples/Example1.mp4", # input_video
@@ -1297,6 +1350,8 @@ with block:
1297
  "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1298
  True, # randomize_seed
1299
  42, # seed
 
 
1300
  1, # batch
1301
  672, # resolution
1302
  1, # total_second_length
@@ -1317,7 +1372,7 @@ with block:
1317
  run_on_click = True,
1318
  fn = process_video,
1319
  inputs = ips_video,
1320
- outputs = [result_video, preview_image, progress_desc, progress_bar, start_button_video, end_button],
1321
  cache_examples = False,
1322
  )
1323
 
@@ -1343,11 +1398,11 @@ with block:
1343
 
1344
  def handle_generation_mode_change(generation_mode_data):
1345
  if generation_mode_data == "text":
1346
- return [gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False)]
1347
  elif generation_mode_data == "image":
1348
- return [gr.update(visible = False), gr.update(visible = True), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False)]
1349
  elif generation_mode_data == "video":
1350
- return [gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True)]
1351
 
1352
  prompt_number.change(fn=handle_prompt_number_change, inputs=[], outputs=[])
1353
  timeless_prompt.change(fn=handle_timeless_prompt_change, inputs=[timeless_prompt], outputs=[final_prompt])
@@ -1369,7 +1424,7 @@ with block:
1369
  generation_mode.change(
1370
  fn=handle_generation_mode_change,
1371
  inputs=[generation_mode],
1372
- outputs=[text_to_video_hint, image_position, input_image, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch, prompt_hint]
1373
  )
1374
 
1375
  # Update display when the page loads
@@ -1377,7 +1432,7 @@ with block:
1377
  fn=handle_generation_mode_change, inputs = [
1378
  generation_mode
1379
  ], outputs = [
1380
- text_to_video_hint, image_position, input_image, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch, prompt_hint
1381
  ]
1382
  )
1383
 
 
57
  free_mem_gb = get_cuda_free_memory_gb(gpu)
58
  high_vram = free_mem_gb > 60
59
 
60
+ #print(f'Free VRAM {free_mem_gb} GB')
61
+ #print(f'High-VRAM Mode: {high_vram}')
62
 
63
  text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder', torch_dtype=torch.float16).cpu()
64
  text_encoder_2 = CLIPTextModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder_2', torch_dtype=torch.float16).cpu()
 
82
  vae.enable_tiling()
83
 
84
  transformer.high_quality_fp32_output_for_inference = True
85
+ #print('transformer.high_quality_fp32_output_for_inference = True')
86
 
87
  transformer.to(dtype=torch.bfloat16)
88
  vae.to(dtype=torch.float16)
 
136
  """
137
  # 20250506 pftq: Normalize video path for Windows compatibility
138
  video_path = str(pathlib.Path(video_path).resolve())
139
+ #print(f"Processing video: {video_path}")
140
 
141
  # 20250506 pftq: Check CUDA availability and fallback to CPU if needed
142
  if device == "cuda" and not torch.cuda.is_available():
143
+ #print("CUDA is not available, falling back to CPU")
144
  device = "cpu"
145
 
146
  try:
147
  # 20250506 pftq: Load video and get FPS
148
+ #print("Initializing VideoReader...")
149
  vr = decord.VideoReader(video_path)
150
  fps = vr.get_avg_fps() # Get input video FPS
151
  num_real_frames = len(vr)
152
+ #print(f"Video loaded: {num_real_frames} frames, FPS: {fps}")
153
 
154
  # Truncate to nearest latent size (multiple of 4)
155
  latent_size_factor = 4
156
  num_frames = (num_real_frames // latent_size_factor) * latent_size_factor
157
+ #if num_frames != num_real_frames:
158
+ #print(f"Truncating video from {num_real_frames} to {num_frames} frames for latent size compatibility")
159
  num_real_frames = num_frames
160
 
161
  # 20250506 pftq: Read frames
162
+ #print("Reading video frames...")
163
  frames = vr.get_batch(range(num_real_frames)).asnumpy() # Shape: (num_real_frames, height, width, channels)
164
+ #print(f"Frames read: {frames.shape}")
165
 
166
  # 20250506 pftq: Get native video resolution
167
  native_height, native_width = frames.shape[1], frames.shape[2]
168
+ #print(f"Native video resolution: {native_width}x{native_height}")
169
 
170
  # 20250506 pftq: Use native resolution if height/width not specified, otherwise use provided values
171
  target_height = native_height if height is None else height
 
174
  # 20250506 pftq: Adjust to nearest bucket for model compatibility
175
  if not no_resize:
176
  target_height, target_width = find_nearest_bucket(target_height, target_width, resolution=resolution)
177
+ #print(f"Adjusted resolution: {target_width}x{target_height}")
178
+ #else:
179
+ #print(f"Using native resolution without resizing: {target_width}x{target_height}")
180
 
181
  # 20250506 pftq: Preprocess frames to match original image processing
182
  processed_frames = []
 
185
  frame_np = resize_and_center_crop(frame, target_width=target_width, target_height=target_height)
186
  processed_frames.append(frame_np)
187
  processed_frames = np.stack(processed_frames) # Shape: (num_real_frames, height, width, channels)
188
+ #print(f"Frames preprocessed: {processed_frames.shape}")
189
 
190
  # 20250506 pftq: Save first frame for CLIP vision encoding
191
  input_image_np = processed_frames[0]
192
 
193
  # 20250506 pftq: Convert to tensor and normalize to [-1, 1]
194
+ #print("Converting frames to tensor...")
195
  frames_pt = torch.from_numpy(processed_frames).float() / 127.5 - 1
196
  frames_pt = frames_pt.permute(0, 3, 1, 2) # Shape: (num_real_frames, channels, height, width)
197
  frames_pt = frames_pt.unsqueeze(0) # Shape: (1, num_real_frames, channels, height, width)
198
  frames_pt = frames_pt.permute(0, 2, 1, 3, 4) # Shape: (1, channels, num_real_frames, height, width)
199
+ #print(f"Tensor shape: {frames_pt.shape}")
200
 
201
  # 20250507 pftq: Save pixel frames for use in worker
202
  input_video_pixels = frames_pt.cpu()
203
 
204
  # 20250506 pftq: Move to device
205
+ #print(f"Moving tensor to device: {device}")
206
  frames_pt = frames_pt.to(device)
207
+ #print("Tensor moved to device")
208
 
209
  # 20250506 pftq: Move VAE to device
210
+ #print(f"Moving VAE to device: {device}")
211
  vae.to(device)
212
+ #print("VAE moved to device")
213
 
214
  # 20250506 pftq: Encode frames in batches
215
+ #print(f"Encoding input video frames in VAE batch size {vae_batch_size} (reduce if memory issues here or if forcing video resolution)")
216
  latents = []
217
  vae.eval()
218
  with torch.no_grad():
 
238
  raise
239
 
240
  # 20250506 pftq: Concatenate latents
241
+ #print("Concatenating latents...")
242
  history_latents = torch.cat(latents, dim=2) # Shape: (1, channels, frames, height//8, width//8)
243
+ #print(f"History latents shape: {history_latents.shape}")
244
 
245
  # 20250506 pftq: Get first frame's latent
246
  start_latent = history_latents[:, :, :1] # Shape: (1, channels, 1, height//8, width//8)
247
+ #print(f"Start latent shape: {start_latent.shape}")
248
 
249
  # 20250506 pftq: Move VAE back to CPU to free GPU memory
250
  if device == "cuda":
251
  vae.to(cpu)
252
  torch.cuda.empty_cache()
253
+ #print("VAE moved back to CPU, CUDA cache cleared")
254
 
255
  return start_latent, input_image_np, history_latents, fps, target_height, target_width, input_video_pixels
256
 
 
266
 
267
  # Check if input file exists
268
  if not os.path.exists(input_file):
269
+ #print(f"Error: Input file {input_file} does not exist")
270
  return False
271
 
272
  # Create a temporary file path
 
289
  if result.returncode == 0:
290
  # Replace the original file with the modified one
291
  shutil.move(temp_file, input_file)
292
+ #print(f"Successfully added comments to {input_file}")
293
  return True
294
  else:
295
  # Clean up temp file if FFmpeg fails
296
  if os.path.exists(temp_file):
297
  os.remove(temp_file)
298
+ #print(f"Error: FFmpeg failed with message:\n{result.stderr}")
299
  return False
300
 
301
  except Exception as e:
 
306
  return False
307
 
308
  @torch.no_grad()
309
+ def worker(input_image, image_position, prompts, n_prompt, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
 
310
  def encode_prompt(prompt, n_prompt):
311
  llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
312
 
 
324
  clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
325
  return [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n]
326
 
327
+ total_latent_sections = (total_second_length * fps_number) / (latent_window_size * 4)
328
  total_latent_sections = int(max(round(total_latent_sections), 1))
329
 
330
+ first_section_index = max(min(math.floor(image_position * (total_latent_sections - 1) / 100), (total_latent_sections - 1)), 0)
331
+ section_index = first_section_index
332
+ forward = (image_position == 0)
333
+
334
  job_id = generate_timestamp()
335
 
336
  stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
 
352
 
353
  prompt_parameters = []
354
 
355
+ for prompt_part in prompts[:total_latent_sections]:
356
  prompt_parameters.append(encode_prompt(prompt_part, n_prompt))
357
 
358
+ # Clean GPU
359
+ if not high_vram:
360
+ unload_complete_models(
361
+ text_encoder, text_encoder_2
362
+ )
363
+
364
  # Processing input image
365
 
366
  stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Image processing ...'))))
 
412
  start_latent = start_latent.to(history_latents)
413
  history_pixels = None
414
 
415
+ history_latents = torch.cat([history_latents, start_latent] if forward else [start_latent, history_latents], dim=2)
416
  total_generated_latent_frames = 1
417
 
418
  if enable_preview:
 
430
  current_step = d['i'] + 1
431
  percentage = int(100.0 * current_step / steps)
432
  hint = f'Sampling {current_step}/{steps}'
433
+ desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / fps_number) :.2f} seconds (FPS-30), Resolution: {height}px * {width}px. The video is being extended now ...'
434
  stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
435
  return
436
  else:
437
  def callback(d):
438
  return
439
 
440
+ indices = torch.arange(0, 1 + 16 + 2 + 1 + latent_window_size).unsqueeze(0)
441
+ if forward:
 
 
 
442
  clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
443
  clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
444
+ else:
445
+ latent_indices, clean_latent_1x_indices, clean_latent_2x_indices, clean_latent_4x_indices, clean_latent_indices_start = indices.split([latent_window_size, 1, 2, 16, 1], dim=1)
446
+ clean_latent_indices = torch.cat([clean_latent_1x_indices, clean_latent_indices_start], dim=1)
447
 
448
def post_process(forward, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, section_index, total_latent_sections, outputs_folder, mp4_crf, stream):
    """Merge one sampled section into the history, decode it and optionally export an MP4.

    Args:
        forward: When true, the new latents are concatenated after the existing
            history along dim 2 and the most recent frames are decoded; when
            false, they are concatenated before it and the leading frames are
            decoded.
        generated_latents: Latents returned by the sampler for this section.
        total_generated_latent_frames: Count of latent frames generated so far;
            increased here by ``generated_latents.shape[2]``.
        history_latents: Accumulated latent history (the new latents are cast to
            its dtype/device through ``.to(history_latents)``).
        high_vram: When false, the transformer is offloaded and the VAE is loaded
            on ``gpu`` before decoding, and models are unloaded afterwards.
        transformer, gpu, vae: Model objects / target device used for offloading
            and decoding.
        history_pixels: Decoded pixels so far, or ``None`` for the first section.
        latent_window_size: Used to size the decoded window (``* 2`` latent
            frames) and the blended overlap (``* 4 - 3`` pixel frames).
        enable_preview: When true, an MP4 is written after every section.
        section_index: Index of the section that has just been sampled.
        total_latent_sections: Total number of sections to generate.
        outputs_folder: Folder receiving the MP4 file.
        mp4_crf: CRF value forwarded to ``save_bcthw_as_mp4``.
        stream: Stream whose ``output_queue`` receives ``('file', path)``.

    Returns:
        ``[total_generated_latent_frames, history_latents, history_pixels]`` with
        their updated values.

    Note:
        ``first_section_index``, ``job_id`` and ``fps_number`` are not
        parameters; they are read from the enclosing scope.
    """
    total_generated_latent_frames += int(generated_latents.shape[2])
    # Grow the history on the side that matches the generation direction.
    history_latents = torch.cat([history_latents, generated_latents.to(history_latents)] if forward else [generated_latents.to(history_latents), history_latents], dim=2)

    if not high_vram:
        offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
        load_model_as_complete(vae, target_device=gpu)

    if history_pixels is None:
        # First section: decode everything generated so far.
        real_history_latents = history_latents[:, :, -total_generated_latent_frames:, :, :] if forward else history_latents[:, :, :total_generated_latent_frames, :, :]
        history_pixels = vae_decode(real_history_latents, vae).cpu()
    else:
        # Later sections: decode only a window at the growing edge and blend it
        # with the pixels already decoded.
        section_latent_frames = latent_window_size * 2
        overlapped_frames = latent_window_size * 4 - 3

        if forward:
            real_history_latents = history_latents[:, :, -min(section_latent_frames, total_generated_latent_frames):, :, :]
            history_pixels = soft_append_bcthw(history_pixels, vae_decode(real_history_latents, vae).cpu(), overlapped_frames)
        else:
            real_history_latents = history_latents[:, :, :min(section_latent_frames, total_generated_latent_frames), :, :]
            history_pixels = soft_append_bcthw(vae_decode(real_history_latents, vae).cpu(), history_pixels, overlapped_frames)

    if not high_vram:
        unload_complete_models()

    # Export on every section when previewing, otherwise only on one specific
    # section: index 0 when the start image sits in the last section, else the
    # last index. NOTE(review): this looks like "the last section processed" --
    # confirm against the loop that drives section_index.
    if enable_preview or section_index == (0 if first_section_index == (total_latent_sections - 1) else (total_latent_sections - 1)):
        output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')

        save_bcthw_as_mp4(history_pixels, output_filename, fps=fps_number, crf=mp4_crf)

        print(f'Decoded. Current latent shape pixel shape {history_pixels.shape}')

        stream.output_queue.push(('file', output_filename))
    return [total_generated_latent_frames, history_latents, history_pixels]
482
+
483
+ while section_index < total_latent_sections:
484
  if stream.input_queue.top() == 'end':
485
  stream.output_queue.push(('end', None))
486
  return
487
 
488
  print(f'section_index = {section_index}, total_latent_sections = {total_latent_sections}')
489
 
490
+ prompt_index = min(section_index, len(prompt_parameters) - 1)
491
+
492
+ [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n] = prompt_parameters[prompt_index]
493
+
494
+ if prompt_index < len(prompt_parameters) - 1 or (prompt_index == total_latent_sections - 1):
495
+ prompt_parameters[prompt_index] = None
496
 
497
  if not high_vram:
498
  unload_complete_models()
 
503
  else:
504
  transformer.initialize_teacache(enable_teacache=False)
505
 
506
+ if forward:
507
+ clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -(16 + 2 + 1):, :, :].split([16, 2, 1], dim=2)
 
 
 
508
  clean_latents = torch.cat([start_latent, clean_latents_1x], dim=2)
509
+ else:
510
+ clean_latents_1x, clean_latents_2x, clean_latents_4x = history_latents[:, :, :(1 + 2 + 16), :, :].split([1, 2, 16], dim=2)
511
+ clean_latents = torch.cat([clean_latents_1x, start_latent], dim=2)
512
 
513
  generated_latents = sample_hunyuan(
514
  transformer=transformer,
 
541
  callback=callback,
542
  )
543
 
544
+ [total_generated_latent_frames, history_latents, history_pixels] = post_process(forward, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, section_index, total_latent_sections, outputs_folder, mp4_crf, stream)
545
+
546
+ if not forward:
547
+ if section_index > 0:
548
+ section_index -= 1
549
+ else:
550
+ clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split([1, 16, 2, 1, latent_window_size], dim=1)
551
+ clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
552
+
553
+ real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :]
554
+ zero_latents = history_latents[:, :, total_generated_latent_frames:, :, :]
555
+ history_latents = torch.cat([zero_latents, real_history_latents], dim=2)
556
+ real_history_latents = zero_latents = None
557
+
558
+ forward = True
559
+ section_index = first_section_index
560
+
561
+ if forward:
562
+ section_index += 1
563
  except:
564
  traceback.print_exc()
565
 
 
594
  stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
595
 
596
  try:
597
+ # 20250506 pftq: Processing input video instead of image
598
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Video processing ...'))))
599
+
600
+ # 20250506 pftq: Encode video
601
+ start_latent, input_image_np, video_latents, fps, height, width = video_encode(input_video, resolution, no_resize, vae, vae_batch_size=vae_batch, device=gpu)[:6]
602
+ start_latent = start_latent.to(dtype=torch.float32).cpu()
603
+ video_latents = video_latents.cpu()
604
+
605
+ total_latent_sections = (total_second_length * fps) / (latent_window_size * 4)
606
+ total_latent_sections = int(max(round(total_latent_sections), 1))
607
+
608
  # Clean GPU
609
  if not high_vram:
610
  unload_complete_models(
 
620
 
621
  prompt_parameters = []
622
 
623
+ for prompt_part in prompts[:total_latent_sections]:
624
  prompt_parameters.append(encode_prompt(prompt_part, n_prompt))
625
 
626
+ # Clean GPU
627
+ if not high_vram:
628
+ unload_complete_models(
629
+ text_encoder, text_encoder_2
630
+ )
 
 
631
 
632
  # CLIP Vision
633
  stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
 
641
  # Dtype
642
  image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
643
 
 
 
 
644
  if enable_preview:
645
  def callback(d):
646
  preview = d['denoised']
 
677
  total_context_frames = num_4x_frames + num_2x_frames + effective_clean_frames
678
  total_context_frames = min(total_context_frames, available_frames) # 20250507 pftq: Edge case for <=1 sec videos
679
 
680
+ indices = torch.arange(0, 1 + num_4x_frames + num_2x_frames + effective_clean_frames + adjusted_latent_frames).unsqueeze(0) # 20250507 pftq: latent_window_size to adjusted_latent_frames for edge case for <=1 sec videos
681
  clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split(
682
  [1, num_4x_frames, num_2x_frames, effective_clean_frames, adjusted_latent_frames], dim=1 # 20250507 pftq: latent_window_size to adjusted_latent_frames for edge case for <=1 sec videos
683
  )
 
846
  stream.output_queue.push(('end', None))
847
  return
848
 
849
def get_duration(input_image, image_position, prompts, generation_mode, n_prompt, seed, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
    """Return the GPU allocation time to request for an image/text generation run.

    Passed as ``duration=`` to the ``@spaces.GPU`` decorator of
    ``process_on_gpu``. The parameter list mirrors that function's signature;
    presumably the decorator calls this with the same arguments -- confirm
    against the ZeroGPU documentation. Only ``allocation_time`` is used.

    Returns:
        ``allocation_time`` unchanged.
    """
    return allocation_time
851
 
852
  # Remove this decorator if you run locally
853
  @spaces.GPU(duration=get_duration)
854
+ def process_on_gpu(input_image, image_position, prompts, generation_mode, n_prompt, seed, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
855
  ):
856
  start = time.time()
857
  global stream
858
  stream = AsyncStream()
859
 
860
+ async_run(worker, input_image, image_position, prompts, n_prompt, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number)
861
 
862
  output_filename = None
863
 
 
866
 
867
  if flag == 'file':
868
  output_filename = data
869
+ yield gr.update(value=output_filename, label="Previewed Frames"), gr.skip(), gr.skip(), gr.skip(), gr.update(interactive=False), gr.update(interactive=True), gr.skip()
870
 
871
  if flag == 'progress':
872
  preview, desc, html = data
873
+ yield gr.update(label="Previewed Frames"), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True), gr.skip()
874
 
875
  if flag == 'end':
876
  end = time.time()
 
879
  secondes = secondes - (minutes * 60)
880
  hours = math.floor(minutes / 60)
881
  minutes = minutes - (hours * 60)
882
+ yield gr.update(value=output_filename, label="Finished Frames"), gr.update(visible=False), gr.skip(), "The process has lasted " + \
883
  ((str(hours) + " h, ") if hours != 0 else "") + \
884
  ((str(minutes) + " min, ") if hours != 0 or minutes != 0 else "") + \
885
  str(secondes) + " sec. " + \
 
893
  n_prompt="",
894
  randomize_seed=True,
895
  seed=31337,
896
+ auto_allocation=True,
897
+ allocation_time=180,
898
  resolution=640,
899
  total_second_length=5,
900
  latent_window_size=9,
 
905
  gpu_memory_preservation=6,
906
  enable_preview=True,
907
  use_teacache=False,
908
+ mp4_crf=16,
909
+ fps_number=30
910
  ):
911
+ if auto_allocation:
912
+ allocation_time = min(total_second_length * 60 * (0.9 if use_teacache else 3.0) * (1 + ((steps - 25) / 25)), 600)
913
 
914
  if torch.cuda.device_count() == 0:
915
  gr.Warning('Set this space to GPU config to make it work.')
916
+ yield gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.update(visible = False)
917
  return
918
 
919
  if randomize_seed:
 
927
  input_image = np.ones((default_height, default_width, 3), dtype=np.uint8) * 255
928
  print("No input image provided. Using a blank white image.")
929
 
930
+ yield gr.update(label="Previewed Frames"), None, '', '', gr.update(interactive=False), gr.update(interactive=True), gr.skip()
931
 
932
  yield from process_on_gpu(input_image,
933
  image_position,
 
937
  seed,
938
  resolution,
939
  total_second_length,
940
+ allocation_time,
941
  latent_window_size,
942
  steps,
943
  cfg,
 
946
  gpu_memory_preservation,
947
  enable_preview,
948
  use_teacache,
949
+ mp4_crf,
950
+ fps_number
951
  )
952
 
953
def get_duration_video(input_video, prompts, n_prompt, seed, batch, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
    """Return the GPU allocation time to request for a video extension run.

    Passed as ``duration=`` to the ``@spaces.GPU`` decorator of
    ``process_video_on_gpu``. The parameter list mirrors that function's
    signature; presumably the decorator calls this with the same arguments --
    confirm against the ZeroGPU documentation. Only ``allocation_time`` is used.

    Returns:
        ``allocation_time`` unchanged.
    """
    return allocation_time
955
 
956
  # Remove this decorator if you run locally
957
  @spaces.GPU(duration=get_duration_video)
958
+ def process_video_on_gpu(input_video, prompts, n_prompt, seed, batch, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
959
  start = time.time()
960
  global stream
961
  stream = AsyncStream()
 
970
 
971
  if flag == 'file':
972
  output_filename = data
973
+ yield gr.update(value=output_filename, label="Previewed Frames"), gr.skip(), gr.skip(), gr.skip(), gr.update(interactive=False), gr.update(interactive=True), gr.skip()
974
 
975
  if flag == 'progress':
976
  preview, desc, html = data
977
+ yield gr.update(label="Previewed Frames"), gr.update(visible=True, value=preview), desc, html, gr.update(interactive=False), gr.update(interactive=True), gr.skip() # 20250506 pftq: Keep refreshing the video in case it got hidden when the tab was in the background
978
 
979
  if flag == 'end':
980
  end = time.time()
 
991
  " You can upscale the result with RIFE. To make all your generated scenes consistent, you can then apply a face swap on the main character. If you do not see the generated video above, the process may have failed. See the logs for more information. If you see an error like ''NVML_SUCCESS == r INTERNAL ASSERT FAILED'', you probably haven't enough VRAM. Test an example or other options to compare. You can share your inputs to the original space or set your space in public for a peer review.", '', gr.update(interactive=True), gr.update(interactive=False), gr.update(visible = False)
992
  break
993
 
994
+ def process_video(input_video, prompt, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
995
  global high_vram
996
+ if auto_allocation:
997
+ allocation_time = min(total_second_length * 60 * (1.5 if use_teacache else 3.0) * (1 + ((steps - 25) / 25)), 600)
998
 
999
  if torch.cuda.device_count() == 0:
1000
  gr.Warning('Set this space to GPU config to make it work.')
1001
+ yield gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.skip(), gr.update(visible = False)
1002
  return
1003
 
1004
  if randomize_seed:
 
1009
  # 20250506 pftq: Updated assertion for video input
1010
  assert input_video is not None, 'No input video!'
1011
 
1012
+ yield gr.update(label="Previewed Frames"), None, '', '', gr.update(interactive=False), gr.update(interactive=True), gr.skip()
1013
 
1014
  # 20250507 pftq: Even the H100 needs offloading if the video dimensions are 720p or higher
1015
  if high_vram and (no_resize or resolution>640):
 
1024
  if cfg > 1:
1025
  gs = 1
1026
 
1027
+ yield from process_video_on_gpu(input_video, prompts, n_prompt, seed, batch, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch)
1028
 
1029
  def end_process():
1030
  stream.input_queue.push('end')
 
1095
  with gr.Row():
1096
  with gr.Column():
1097
  generation_mode = gr.Radio([["Text-to-Video", "text"], ["Image-to-Video", "image"], ["Video Extension", "video"]], elem_id="generation-mode", label="Generation mode", value = "image")
1098
+ text_to_video_hint = gr.HTML("Text-to-Video badly works. I discourage to use the Text-to-Video feature. You should rather generate an image with Flux and use Image-to-Video. You will save time.")
1099
  input_image = gr.Image(sources='upload', type="numpy", label="Image", height=320)
1100
+ image_position = gr.Slider(label="Image position", minimum=0, maximum=100, value=0, step=1, info='0=Video start; 100=Video end (lower quality)')
1101
  input_video = gr.Video(sources='upload', label="Input Video", height=320)
1102
  timeless_prompt = gr.Textbox(label="Timeless prompt", info='Used on the whole duration of the generation', value='', placeholder="The creature starts to move, fast motion, fixed camera, focus motion, consistent arm, consistent position, mute colors, insanely detailed")
1103
  prompt_number = gr.Slider(label="Timed prompt number", minimum=0, maximum=1000, value=0, step=1, info='Prompts will automatically appear')
 
1111
 
1112
  final_prompt = gr.Textbox(label="Final prompt", value='', info='Use ; to separate in time; beware to write to stop the previous action')
1113
  prompt_hint = gr.HTML("Video extension barely follows the prompt; to force to follow the prompt, you have to set the Distilled CFG Scale to 3.0 and the Context Frames to 2 but the video quality will be poor.")
1114
+ total_second_length = gr.Slider(label="Video length to generate (seconds if 30 fps)", minimum=1, maximum=120, value=2, step=0.1)
1115
 
1116
  with gr.Row():
1117
  start_button = gr.Button(value="🎥 Generate", variant="primary")
 
1124
 
1125
  n_prompt = gr.Textbox(label="Negative Prompt", value="Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", info='Requires using normal CFG (undistilled) instead of Distilled (set Distilled=1 and CFG > 1).')
1126
 
1127
+ fps_number = gr.Slider(label="Frame per seconds", info="The model is trained for 30 fps so other fps may generate weird results", minimum=10, maximum=60, value=30, step=1)
1128
+
1129
  latent_window_size = gr.Slider(label="Latent Window Size", minimum=1, maximum=33, value=9, step=1, info='Generate more frames at a time (larger chunks). Less degradation and better blending but higher VRAM cost. Should not change.')
1130
  steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=30, step=1, info='Increase for more quality, especially if using high non-distilled CFG. If your animation has very few motion, you may have brutal brightness change; this can be fixed increasing the steps.')
1131
 
 
1166
  with gr.Row():
1167
  randomize_seed = gr.Checkbox(label='Randomize seed', value=True, info='If checked, the seed is always different')
1168
  seed = gr.Slider(label="Seed", minimum=0, maximum=np.iinfo(np.int32).max, step=1, randomize=True)
1169
+ with gr.Row():
1170
+ auto_allocation = gr.Checkbox(label='Auto allocation', value=True, info='If checked, the GPU allocation time is estimated from the parameters')
1171
+ allocation_time = gr.Slider(label="GPU allocation time (in seconds)", info='lower=May abort run, higher=Quota penalty for next runs; only useful for ZeroGPU; for instance set to 88 when you have the message "You have exceeded your GPU quota (180s requested vs. 89s left)."', value=180, minimum=60, maximum=320, step=1)
1172
 
1173
  with gr.Column():
1174
  warning = gr.HTML(value = "<center><big>Your computer must <u>not</u> enter into standby mode.</big><br/>On Chrome, you can force to keep a tab alive in <code>chrome://discards/</code></center>", visible = False)
 
1178
  progress_bar = gr.HTML('', elem_classes='no-generating-animation')
1179
 
1180
  # 20250506 pftq: Updated inputs to include num_clean_frames
1181
+ ips = [input_image, image_position, final_prompt, generation_mode, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number]
1182
+ ips_video = [input_video, final_prompt, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch]
1183
 
1184
  gr.Examples(
1185
+ label = "✍️ Examples from text",
1186
  examples = [
1187
  [
1188
  None, # input_image
 
1192
  "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1193
  True, # randomize_seed
1194
  42, # seed
1195
+ True, # auto_allocation
1196
+ 180, # allocation_time
1197
  672, # resolution
1198
  1, # total_second_length
1199
  9, # latent_window_size
 
1204
  6, # gpu_memory_preservation
1205
  False, # enable_preview
1206
  False, # use_teacache
1207
+ 16, # mp4_crf
1208
+ 30 # fps_number
1209
  ]
1210
  ],
1211
  run_on_click = True,
1212
  fn = process,
1213
  inputs = ips,
1214
+ outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
1215
  cache_examples = False,
1216
  )
1217
 
1218
  gr.Examples(
1219
+ label = "🖼️ Examples from image",
1220
  examples = [
1221
  [
1222
  "./img_examples/Example1.png", # input_image
 
1226
  "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1227
  True, # randomize_seed
1228
  42, # seed
1229
+ True, # auto_allocation
1230
+ 180, # allocation_time
1231
  672, # resolution
1232
  1, # total_second_length
1233
  9, # latent_window_size
 
1238
  6, # gpu_memory_preservation
1239
  False, # enable_preview
1240
  True, # use_teacache
1241
+ 16, # mp4_crf
1242
+ 30 # fps_number
1243
  ],
1244
  [
1245
  "./img_examples/Example2.webp", # input_image
 
1249
  "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1250
  True, # randomize_seed
1251
  42, # seed
1252
+ True, # auto_allocation
1253
+ 180, # allocation_time
1254
  672, # resolution
1255
  2, # total_second_length
1256
  9, # latent_window_size
 
1261
  6, # gpu_memory_preservation
1262
  False, # enable_preview
1263
  True, # use_teacache
1264
+ 16, # mp4_crf
1265
+ 30 # fps_number
1266
  ],
1267
  [
1268
  "./img_examples/Example2.webp", # input_image
 
1272
  "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1273
  True, # randomize_seed
1274
  42, # seed
1275
+ True, # auto_allocation
1276
+ 180, # allocation_time
1277
  672, # resolution
1278
  2, # total_second_length
1279
  9, # latent_window_size
 
1284
  6, # gpu_memory_preservation
1285
  False, # enable_preview
1286
  True, # use_teacache
1287
+ 16, # mp4_crf
1288
+ 30 # fps_number
1289
  ],
1290
  [
1291
  "./img_examples/Example3.jpg", # input_image
 
1295
  "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1296
  True, # randomize_seed
1297
  42, # seed
1298
+ True, # auto_allocation
1299
+ 180, # allocation_time
1300
  672, # resolution
1301
  1, # total_second_length
1302
  9, # latent_window_size
 
1307
  6, # gpu_memory_preservation
1308
  False, # enable_preview
1309
  True, # use_teacache
1310
+ 16, # mp4_crf
1311
+ 30 # fps_number
1312
  ],
1313
  [
1314
  "./img_examples/Example4.webp", # input_image
 
1318
  "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1319
  True, # randomize_seed
1320
  42, # seed
1321
+ True, # auto_allocation
1322
+ 180, # allocation_time
1323
  672, # resolution
1324
  1, # total_second_length
1325
  9, # latent_window_size
 
1330
  6, # gpu_memory_preservation
1331
  False, # enable_preview
1332
  False, # use_teacache
1333
+ 16, # mp4_crf
1334
+ 30 # fps_number
1335
  ]
1336
  ],
1337
  run_on_click = True,
1338
  fn = process,
1339
  inputs = ips,
1340
+ outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
1341
  cache_examples = False,
1342
  )
1343
 
1344
  gr.Examples(
1345
+ label = "🎥 Examples from video",
1346
  examples = [
1347
  [
1348
  "./img_examples/Example1.mp4", # input_video
 
1350
  "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
1351
  True, # randomize_seed
1352
  42, # seed
1353
+ True, # auto_allocation
1354
+ 180, # allocation_time
1355
  1, # batch
1356
  672, # resolution
1357
  1, # total_second_length
 
1372
  run_on_click = True,
1373
  fn = process_video,
1374
  inputs = ips_video,
1375
+ outputs = [result_video, preview_image, progress_desc, progress_bar, start_button_video, end_button, warning],
1376
  cache_examples = False,
1377
  )
1378
 
 
1398
 
1399
  def handle_generation_mode_change(generation_mode_data):
1400
  if generation_mode_data == "text":
1401
+ return [gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True)]
1402
  elif generation_mode_data == "image":
1403
+ return [gr.update(visible = False), gr.update(visible = True), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True)]
1404
  elif generation_mode_data == "video":
1405
+ return [gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = False)]
1406
 
1407
  prompt_number.change(fn=handle_prompt_number_change, inputs=[], outputs=[])
1408
  timeless_prompt.change(fn=handle_timeless_prompt_change, inputs=[timeless_prompt], outputs=[final_prompt])
 
1424
  generation_mode.change(
1425
  fn=handle_generation_mode_change,
1426
  inputs=[generation_mode],
1427
+ outputs=[text_to_video_hint, image_position, input_image, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch, prompt_hint, fps_number]
1428
  )
1429
 
1430
  # Update display when the page loads
 
1432
  fn=handle_generation_mode_change, inputs = [
1433
  generation_mode
1434
  ], outputs = [
1435
+ text_to_video_hint, image_position, input_image, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch, prompt_hint, fps_number
1436
  ]
1437
  )
1438