Fabrice-TIERCELIN committed on
Commit
6c41ee0
·
verified ·
1 Parent(s): 66a6761

Video+end frame

Browse files
Files changed (1) hide show
  1. app.py +210 -45
app.py CHANGED
@@ -10,9 +10,7 @@ except:
10
  class spaces():
11
  def GPU(*args, **kwargs):
12
  def decorator(function):
13
- def new_function(*dummy_args, **dummy_kwargs):
14
- return function(*dummy_args, **dummy_kwargs)
15
- return new_function
16
  return decorator
17
 
18
  import gradio as gr
@@ -24,6 +22,7 @@ import numpy as np
24
  import random
25
  import time
26
  import math
 
27
  # 20250506 pftq: Added for video input loading
28
  import decord
29
  # 20250506 pftq: Added for progress bars in video_encode
@@ -309,8 +308,67 @@ def set_mp4_comments_imageio_ffmpeg(input_file, comments):
309
  print(f"Error saving prompt to video metadata, ffmpeg may be required: "+str(e))
310
  return False
311
 
 
312
  @torch.no_grad()
313
- def worker(input_image, end_image, image_position, prompts, n_prompt, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
  def encode_prompt(prompt, n_prompt):
315
  llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
316
 
@@ -405,6 +463,8 @@ def worker(input_image, end_image, image_position, prompts, n_prompt, seed, reso
405
  return [start_latent, image_encoder_last_hidden_state]
406
 
407
  [start_latent, image_encoder_last_hidden_state] = get_start_latent(input_image, height, width, vae, gpu, image_encoder, high_vram)
 
 
408
 
409
  # Dtype
410
 
@@ -500,7 +560,7 @@ def worker(input_image, end_image, image_position, prompts, n_prompt, seed, reso
500
  [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n] = prompt_parameters[prompt_index]
501
 
502
  if prompt_index < len(prompt_parameters) - 1 or (prompt_index == total_latent_sections - 1):
503
- prompt_parameters[prompt_index] = None
504
 
505
  if not high_vram:
506
  unload_complete_models()
@@ -548,6 +608,13 @@ def worker(input_image, end_image, image_position, prompts, n_prompt, seed, reso
548
  clean_latent_4x_indices=clean_latent_4x_indices,
549
  callback=callback,
550
  )
 
 
 
 
 
 
 
551
 
552
  [total_generated_latent_frames, history_latents, history_pixels] = post_process(forward, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, section_index, total_latent_sections, outputs_folder, mp4_crf, stream)
553
 
@@ -561,7 +628,8 @@ def worker(input_image, end_image, image_position, prompts, n_prompt, seed, reso
561
  real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :]
562
  zero_latents = history_latents[:, :, total_generated_latent_frames:, :, :]
563
  history_latents = torch.cat([zero_latents, real_history_latents], dim=2)
564
- real_history_latents = zero_latents = None
 
565
 
566
  forward = True
567
  section_index = first_section_index
@@ -580,7 +648,7 @@ def worker(input_image, end_image, image_position, prompts, n_prompt, seed, reso
580
  return
581
 
582
  @torch.no_grad()
583
- def worker_start_end(input_image, end_image, image_position, prompts, n_prompt, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
584
  def encode_prompt(prompt, n_prompt):
585
  llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
586
 
@@ -689,6 +757,8 @@ def worker_start_end(input_image, end_image, image_position, prompts, n_prompt,
689
  return [start_latent, end_latent, image_encoder_last_hidden_state]
690
 
691
  [start_latent, end_latent, image_encoder_last_hidden_state] = get_start_latent(input_image, has_end_image, end_image, height, width, vae, gpu, image_encoder, high_vram)
 
 
692
 
693
  # Dtype
694
  image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
@@ -708,7 +778,7 @@ def worker_start_end(input_image, end_image, image_position, prompts, n_prompt,
708
  total_generated_latent_frames = 0
709
 
710
  if total_latent_sections > 4:
711
- # In theory the latent_paddings should follow the above sequence, but it seems that duplicating some
712
  # items looks better than expanding it when total_latent_sections > 4
713
  # One can try to remove below trick and just
714
  # use `latent_paddings = list(reversed(range(total_latent_sections)))` to compare
@@ -787,15 +857,15 @@ def worker_start_end(input_image, end_image, image_position, prompts, n_prompt,
787
  if len(prompt_parameters) > 0:
788
  [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n] = prompt_parameters.pop(len(prompt_parameters) - 1)
789
 
790
- indices = torch.arange(1 + latent_padding_size + latent_window_size + 1 + 2 + 16).unsqueeze(0)
791
- clean_latent_indices_pre, blank_indices, latent_indices, clean_latent_indices_post, clean_latent_2x_indices, clean_latent_4x_indices = indices.split([1, latent_padding_size, latent_window_size, 1, 2, 16], dim=1)
792
  clean_latent_indices = torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1)
793
 
794
  clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, :1 + 2 + 16, :, :].split([1, 2, 16], dim=2)
795
 
796
  # Use end image latent for the first section if provided
797
  if has_end_image and is_first_section:
798
- clean_latents_post = end_latent
799
 
800
  clean_latents = torch.cat([start_latent, clean_latents_post], dim=2)
801
 
@@ -838,6 +908,13 @@ def worker_start_end(input_image, end_image, image_position, prompts, n_prompt,
838
  clean_latent_4x_indices=clean_latent_4x_indices,
839
  callback=callback,
840
  )
 
 
 
 
 
 
 
841
 
842
  [total_generated_latent_frames, history_latents, history_pixels] = post_process(job_id, start_latent, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, outputs_folder, mp4_crf, stream, is_last_section)
843
 
@@ -856,7 +933,7 @@ def worker_start_end(input_image, end_image, image_position, prompts, n_prompt,
856
 
857
  # 20250506 pftq: Modified worker to accept video input and clean frame count
858
  @torch.no_grad()
859
- def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
860
  def encode_prompt(prompt, n_prompt):
861
  llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
862
 
@@ -882,6 +959,7 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
882
 
883
  # 20250506 pftq: Encode video
884
  start_latent, input_image_np, video_latents, fps, height, width = video_encode(input_video, resolution, no_resize, vae, vae_batch_size=vae_batch, device=gpu)
 
885
  start_latent = start_latent.to(dtype=torch.float32, device=cpu)
886
  video_latents = video_latents.cpu()
887
 
@@ -919,12 +997,29 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
919
  load_model_as_complete(image_encoder, target_device=gpu)
920
 
921
  image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
922
 
923
  # Clean GPU
924
  if not high_vram:
925
- unload_complete_models(image_encoder)
926
 
927
  image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
 
928
 
929
  # Dtype
930
  image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
@@ -951,7 +1046,13 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
951
  def callback(d):
952
  return
953
 
954
- def compute_latent(history_latents, latent_window_size, num_clean_frames, start_latent):
 
 
 
 
 
 
955
  # 20250506 pftq: Use user-specified number of context frames, matching original allocation for num_clean_frames=2
956
  available_frames = history_latents.shape[2] # Number of latent frames
957
  max_pixel_frames = min(latent_window_size * 4 - 3, available_frames * 4) # Cap at available pixel frames
@@ -965,11 +1066,12 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
965
  total_context_frames = num_4x_frames + num_2x_frames + effective_clean_frames
966
  total_context_frames = min(total_context_frames, available_frames) # 20250507 pftq: Edge case for <=1 sec videos
967
 
968
- indices = torch.arange(0, 1 + num_4x_frames + num_2x_frames + effective_clean_frames + adjusted_latent_frames).unsqueeze(0) # 20250507 pftq: latent_window_size to adjusted_latent_frames for edge case for <=1 sec videos
969
- clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices = indices.split(
970
- [1, num_4x_frames, num_2x_frames, effective_clean_frames, adjusted_latent_frames], dim=1 # 20250507 pftq: latent_window_size to adjusted_latent_frames for edge case for <=1 sec videos
 
971
  )
972
- clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices], dim=1)
973
 
974
  # 20250506 pftq: Split history_latents dynamically based on available frames
975
  fallback_frame_count = 2 # 20250507 pftq: Changed 0 to 2 Edge case for <=1 sec videos
@@ -1002,7 +1104,10 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
1002
  if effective_clean_frames > 0 and split_idx < len(splits):
1003
  clean_latents_1x = splits[split_idx]
1004
 
1005
- clean_latents = torch.cat([start_latent, clean_latents_1x], dim=2)
 
 
 
1006
 
1007
  # 20250507 pftq: Fix for <=1 sec videos.
1008
  max_frames = min(latent_window_size * 4 - 3, history_latents.shape[2] * 4)
@@ -1024,10 +1129,18 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
1024
  history_latents = video_latents
1025
  total_generated_latent_frames = history_latents.shape[2]
1026
  # 20250506 pftq: Initialize history_pixels to fix UnboundLocalError
1027
- history_pixels = None
1028
- previous_video = None
 
 
 
 
 
1029
 
1030
- for section_index in range(total_latent_sections):
 
 
 
1031
  if stream.input_queue.top() == 'end':
1032
  stream.output_queue.push(('end', None))
1033
  return
@@ -1046,7 +1159,7 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
1046
  else:
1047
  transformer.initialize_teacache(enable_teacache=False)
1048
 
1049
- [max_frames, clean_latents, clean_latents_2x, clean_latents_4x, latent_indices, clean_latents, clean_latent_indices, clean_latent_2x_indices, clean_latent_4x_indices] = compute_latent(history_latents, latent_window_size, num_clean_frames, start_latent)
1050
 
1051
  generated_latents = sample_hunyuan(
1052
  transformer=transformer,
@@ -1077,6 +1190,13 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
1077
  clean_latent_4x_indices=clean_latent_4x_indices,
1078
  callback=callback,
1079
  )
 
 
 
 
 
 
 
1080
 
1081
  total_generated_latent_frames += int(generated_latents.shape[2])
1082
  history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)
@@ -1134,17 +1254,17 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
1134
  stream.output_queue.push(('end', None))
1135
  return
1136
 
1137
- def get_duration(input_image, end_image, image_position, prompts, generation_mode, n_prompt, seed, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
1138
  return allocation_time
1139
 
1140
  @spaces.GPU(duration=get_duration)
1141
- def process_on_gpu(input_image, end_image, image_position, prompts, generation_mode, n_prompt, seed, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number
1142
  ):
1143
  start = time.time()
1144
  global stream
1145
  stream = AsyncStream()
1146
 
1147
- async_run(worker_start_end if generation_mode == "start_end" else worker, input_image, end_image, image_position, prompts, n_prompt, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number)
1148
 
1149
  output_filename = None
1150
 
@@ -1170,12 +1290,13 @@ def process_on_gpu(input_image, end_image, image_position, prompts, generation_m
1170
  ((str(hours) + " h, ") if hours != 0 else "") + \
1171
  ((str(minutes) + " min, ") if hours != 0 or minutes != 0 else "") + \
1172
  str(secondes) + " sec. " + \
1173
- "You can upscale the result with RIFE. To make all your generated scenes consistent, you can then apply a face swap on the main character. If you do not see the generated video above, the process may have failed. See the logs for more information. If you see an error like ''NVML_SUCCESS == r INTERNAL ASSERT FAILED'', you probably haven't enough VRAM. Test an example or other options to compare. You can share your inputs to the original space or set your space in public for a peer review.", gr.update(interactive=True), gr.update(interactive=False), gr.update(visible = False)
1174
  break
1175
 
1176
  def process(input_image,
1177
  end_image,
1178
  image_position=0,
 
1179
  prompt="",
1180
  generation_mode="image",
1181
  n_prompt="",
@@ -1209,17 +1330,20 @@ def process(input_image,
1209
 
1210
  prompts = prompt.split(";")
1211
 
1212
- # assert input_image is not None, 'No input image!'
1213
  if generation_mode == "text":
1214
- default_height, default_width = 640, 640
1215
  input_image = np.ones((default_height, default_width, 3), dtype=np.uint8) * 255
1216
  print("No input image provided. Using a blank white image.")
 
 
1217
 
1218
  yield gr.update(label="Previewed Frames"), None, '', '', gr.update(interactive=False), gr.update(interactive=True), gr.skip()
1219
 
 
1220
  yield from process_on_gpu(input_image,
1221
  end_image,
1222
  image_position,
 
1223
  prompts,
1224
  generation_mode,
1225
  n_prompt,
@@ -1239,17 +1363,17 @@ def process(input_image,
1239
  fps_number
1240
  )
1241
 
1242
- def get_duration_video(input_video, prompts, n_prompt, seed, batch, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
1243
  return allocation_time
1244
 
1245
  @spaces.GPU(duration=get_duration_video)
1246
- def process_video_on_gpu(input_video, prompts, n_prompt, seed, batch, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
1247
  start = time.time()
1248
  global stream
1249
  stream = AsyncStream()
1250
 
1251
  # 20250506 pftq: Pass num_clean_frames, vae_batch, etc
1252
- async_run(worker_video, input_video, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch)
1253
 
1254
  output_filename = None
1255
 
@@ -1276,10 +1400,10 @@ def process_video_on_gpu(input_video, prompts, n_prompt, seed, batch, resolution
1276
  ((str(hours) + " h, ") if hours != 0 else "") + \
1277
  ((str(minutes) + " min, ") if hours != 0 or minutes != 0 else "") + \
1278
  str(secondes) + " sec. " + \
1279
- " You can upscale the result with RIFE. To make all your generated scenes consistent, you can then apply a face swap on the main character. If you do not see the generated video above, the process may have failed. See the logs for more information. If you see an error like ''NVML_SUCCESS == r INTERNAL ASSERT FAILED'', you probably haven't enough VRAM. Test an example or other options to compare. You can share your inputs to the original space or set your space in public for a peer review.", '', gr.update(interactive=True), gr.update(interactive=False), gr.update(visible = False)
1280
  break
1281
 
1282
- def process_video(input_video, prompt, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
1283
  global high_vram
1284
  if auto_allocation:
1285
  allocation_time = min(total_second_length * 60 * (2.5 if use_teacache else 3.5) * (1 + ((steps - 25) / 25))**2, 600)
@@ -1312,7 +1436,8 @@ def process_video(input_video, prompt, n_prompt, randomize_seed, seed, auto_allo
1312
  if cfg > 1:
1313
  gs = 1
1314
 
1315
- yield from process_video_on_gpu(input_video, prompts, n_prompt, seed, batch, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch)
 
1316
 
1317
  def end_process():
1318
  stream.input_queue.push('end')
@@ -1382,12 +1507,12 @@ with block:
1382
  local_storage = gr.BrowserState(default_local_storage)
1383
  with gr.Row():
1384
  with gr.Column():
1385
- generation_mode = gr.Radio([["Text-to-Video", "text"], ["Image-to-Video", "image"], ["Start & end frames", "start_end"], ["Video Extension", "video"]], elem_id="generation-mode", label="Generation mode", value = "image")
1386
  text_to_video_hint = gr.HTML("Text-to-Video badly works with a flash effect at the start. I discourage to use the Text-to-Video feature. You should rather generate an image with Flux and use Image-to-Video. You will save time.")
1387
  input_image = gr.Image(sources='upload', type="numpy", label="Image", height=320)
1388
- end_image = gr.Image(sources='upload', type="numpy", label="End Frame (Optional)", height=320)
1389
  image_position = gr.Slider(label="Image position", minimum=0, maximum=100, value=0, step=1, info='0=Video start; 100=Video end (lower quality)')
1390
  input_video = gr.Video(sources='upload', label="Input Video", height=320)
 
1391
  timeless_prompt = gr.Textbox(label="Timeless prompt", info='Used on the whole duration of the generation', value='', placeholder="The creature starts to move, fast motion, fixed camera, focus motion, consistent arm, consistent position, mute colors, insanely detailed")
1392
  prompt_number = gr.Slider(label="Timed prompt number", minimum=0, maximum=1000, value=0, step=1, info='Prompts will automatically appear')
1393
 
@@ -1414,6 +1539,7 @@ with block:
1414
  n_prompt = gr.Textbox(label="Negative Prompt", value="Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", info='Requires using normal CFG (undistilled) instead of Distilled (set Distilled=1 and CFG > 1).')
1415
 
1416
  fps_number = gr.Slider(label="Frame per seconds", info="The model is trained for 30 fps so other fps may generate weird results", minimum=10, maximum=60, value=30, step=1)
 
1417
 
1418
  latent_window_size = gr.Slider(label="Latent Window Size", minimum=1, maximum=33, value=9, step=1, info='Generate more frames at a time (larger chunks). Less degradation and better blending but higher VRAM cost. Should not change.')
1419
  steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=30, step=1, info='Increase for more quality, especially if using high non-distilled CFG. If your animation has very few motion, you may have brutal brightness change; this can be fixed increasing the steps.')
@@ -1466,8 +1592,8 @@ with block:
1466
  progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
1467
  progress_bar = gr.HTML('', elem_classes='no-generating-animation')
1468
 
1469
- ips = [input_image, end_image, image_position, final_prompt, generation_mode, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number]
1470
- ips_video = [input_video, final_prompt, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch]
1471
 
1472
  gr.Examples(
1473
  label = "✍️ Examples from text",
@@ -1476,6 +1602,7 @@ with block:
1476
  None, # input_image
1477
  None, # end_image
1478
  0, # image_position
 
1479
  "Overcrowed street in Japan, photorealistic, realistic, intricate details, 8k, insanely detailed",
1480
  "text", # generation_mode
1481
  "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
@@ -1511,6 +1638,7 @@ with block:
1511
  "./img_examples/Example1.png", # input_image
1512
  None, # end_image
1513
  0, # image_position
 
1514
  "A dolphin emerges from the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
1515
  "image", # generation_mode
1516
  "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
@@ -1535,6 +1663,7 @@ with block:
1535
  "./img_examples/Example2.webp", # input_image
1536
  None, # end_image
1537
  0, # image_position
 
1538
  "A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks, the man stops talking and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens",
1539
  "image", # generation_mode
1540
  "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
@@ -1559,6 +1688,7 @@ with block:
1559
  "./img_examples/Example2.webp", # input_image
1560
  None, # end_image
1561
  0, # image_position
 
1562
  "A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks, the woman stops talking and the woman listens A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens",
1563
  "image", # generation_mode
1564
  "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
@@ -1583,6 +1713,7 @@ with block:
1583
  "./img_examples/Example3.jpg", # input_image
1584
  None, # end_image
1585
  0, # image_position
 
1586
  "एउटा केटा दायाँतिर हिँडिरहेको छ, पूर्ण दृश्य, पूर्ण-लम्बाइको दृश्य, कार्टुन",
1587
  "image", # generation_mode
1588
  "हात छुटेको, लामो हात, अवास्तविक स्थिति, असम्भव विकृति, देखिने हड्डी, मांसपेशी संकुचन, कमजोर फ्रेम, धमिलो, धमिलो, अत्यधिक चिल्लो", # n_prompt
@@ -1607,6 +1738,7 @@ with block:
1607
  "./img_examples/Example4.webp", # input_image
1608
  None, # end_image
1609
  100, # image_position
 
1610
  "A building starting to explode, photorealistic, realisitc, 8k, insanely detailed",
1611
  "image", # generation_mode
1612
  "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
@@ -1642,9 +1774,10 @@ with block:
1642
  "./img_examples/Example5.png", # input_image
1643
  "./img_examples/Example6.png", # end_image
1644
  0, # image_position
1645
- "A woman jumps out of the train and arrives on the ground, viewed from the outside, photorealistic, realistic, amateur photography, midday, insanely detailed, 8k",
 
1646
  "start_end", # generation_mode
1647
- "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
1648
  True, # randomize_seed
1649
  42, # seed
1650
  True, # auto_allocation
@@ -1675,8 +1808,36 @@ with block:
1675
  examples = [
1676
  [
1677
  "./img_examples/Example1.mp4", # input_video
 
 
1678
  "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
1679
- "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1680
  True, # randomize_seed
1681
  42, # seed
1682
  True, # auto_allocation
@@ -1734,6 +1895,7 @@ with block:
1734
  gr.update(visible = False), # image_position
1735
  gr.update(visible = False), # input_image
1736
  gr.update(visible = False), # end_image
 
1737
  gr.update(visible = False), # input_video
1738
  gr.update(visible = True), # start_button
1739
  gr.update(visible = False), # start_button_video
@@ -1750,6 +1912,7 @@ with block:
1750
  gr.update(visible = True), # image_position
1751
  gr.update(visible = True), # input_image
1752
  gr.update(visible = False), # end_image
 
1753
  gr.update(visible = False), # input_video
1754
  gr.update(visible = True), # start_button
1755
  gr.update(visible = False), # start_button_video
@@ -1766,6 +1929,7 @@ with block:
1766
  gr.update(visible = False), # image_position
1767
  gr.update(visible = True), # input_image
1768
  gr.update(visible = True), # end_image
 
1769
  gr.update(visible = False), # input_video
1770
  gr.update(visible = True), # start_button
1771
  gr.update(visible = False), # start_button_video
@@ -1781,7 +1945,8 @@ with block:
1781
  gr.update(visible = False), # text_to_video_hint
1782
  gr.update(visible = False), # image_position
1783
  gr.update(visible = False), # input_image
1784
- gr.update(visible = False), # end_image
 
1785
  gr.update(visible = True), # input_video
1786
  gr.update(visible = False), # start_button
1787
  gr.update(visible = True), # start_button_video
@@ -1813,7 +1978,7 @@ with block:
1813
  generation_mode.change(
1814
  fn=handle_generation_mode_change,
1815
  inputs=[generation_mode],
1816
- outputs=[text_to_video_hint, image_position, input_image, end_image, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch, prompt_hint, fps_number]
1817
  )
1818
 
1819
  # Update display when the page loads
@@ -1821,7 +1986,7 @@ with block:
1821
  fn=handle_generation_mode_change, inputs = [
1822
  generation_mode
1823
  ], outputs = [
1824
- text_to_video_hint, image_position, input_image, end_image, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch, prompt_hint, fps_number
1825
  ]
1826
  )
1827
 
 
10
  class spaces():
11
  def GPU(*args, **kwargs):
12
  def decorator(function):
13
+ return lambda *dummy_args, **dummy_kwargs: function(*dummy_args, **dummy_kwargs)
 
 
14
  return decorator
15
 
16
  import gradio as gr
 
22
  import random
23
  import time
24
  import math
25
+ import gc
26
  # 20250506 pftq: Added for video input loading
27
  import decord
28
  # 20250506 pftq: Added for progress bars in video_encode
 
308
  print(f"Error saving prompt to video metadata, ffmpeg may be required: "+str(e))
309
  return False
310
 
311
+ # 20250507 pftq: New function to encode a single image (end frame)
312
  @torch.no_grad()
313
+ def image_encode(image_np, target_width, target_height, vae, image_encoder, feature_extractor, device="cuda"):
314
+ """
315
+ Encode a single image into a latent and compute its CLIP vision embedding.
316
+
317
+ Args:
318
+ image_np: Input image as numpy array.
319
+ target_width, target_height: Exact resolution to resize the image to (matches start frame).
320
+ vae: AutoencoderKLHunyuanVideo model.
321
+ image_encoder: SiglipVisionModel for CLIP vision encoding.
322
+ feature_extractor: SiglipImageProcessor for preprocessing.
323
+ device: Device for computation (e.g., "cuda").
324
+
325
+ Returns:
326
+ latent: Latent representation of the image (shape: [1, channels, 1, height//8, width//8]).
327
+ clip_embedding: CLIP vision embedding of the image.
328
+ processed_image_np: Processed image as numpy array (after resizing).
329
+ """
330
+ # 20250507 pftq: Process end frame with exact start frame dimensions
331
+ print("Processing end frame...")
332
+ try:
333
+ print(f"Using exact start frame resolution for end frame: {target_width}x{target_height}")
334
+
335
+ # Resize and preprocess image to match start frame
336
+ processed_image_np = resize_and_center_crop(image_np, target_width=target_width, target_height=target_height)
337
+
338
+ # Convert to tensor and normalize
339
+ image_pt = torch.from_numpy(processed_image_np).float() / 127.5 - 1
340
+ image_pt = image_pt.permute(2, 0, 1).unsqueeze(0).unsqueeze(2) # Shape: [1, channels, 1, height, width]
341
+ image_pt = image_pt.to(device)
342
+
343
+ # Move VAE to device
344
+ vae.to(device)
345
+
346
+ # Encode to latent
347
+ latent = vae_encode(image_pt, vae)
348
+ print(f"image_encode vae output shape: {latent.shape}")
349
+
350
+ # Move image encoder to device
351
+ image_encoder.to(device)
352
+
353
+ # Compute CLIP vision embedding
354
+ clip_embedding = hf_clip_vision_encode(processed_image_np, feature_extractor, image_encoder).last_hidden_state
355
+
356
+ # Move models back to CPU and clear cache
357
+ if device == "cuda":
358
+ vae.to(cpu)
359
+ image_encoder.to(cpu)
360
+ torch.cuda.empty_cache()
361
+ print("VAE and image encoder moved back to CPU, CUDA cache cleared")
362
+
363
+ print(f"End latent shape: {latent.shape}")
364
+ return latent, clip_embedding, processed_image_np
365
+
366
+ except Exception as e:
367
+ print(f"Error in image_encode: {str(e)}")
368
+ raise
369
+
370
+ @torch.no_grad()
371
+ def worker(input_image, end_image, image_position, end_stillness, prompts, n_prompt, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
372
  def encode_prompt(prompt, n_prompt):
373
  llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
374
 
 
463
  return [start_latent, image_encoder_last_hidden_state]
464
 
465
  [start_latent, image_encoder_last_hidden_state] = get_start_latent(input_image, height, width, vae, gpu, image_encoder, high_vram)
466
+ del input_image
467
+ del end_image
468
 
469
  # Dtype
470
 
 
560
  [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n] = prompt_parameters[prompt_index]
561
 
562
  if prompt_index < len(prompt_parameters) - 1 or (prompt_index == total_latent_sections - 1):
563
+ del prompt_parameters[prompt_index]
564
 
565
  if not high_vram:
566
  unload_complete_models()
 
608
  clean_latent_4x_indices=clean_latent_4x_indices,
609
  callback=callback,
610
  )
611
+ del clean_latents
612
+ del clean_latents_2x
613
+ del clean_latents_4x
614
+ del latent_indices
615
+ del clean_latent_indices
616
+ del clean_latent_2x_indices
617
+ del clean_latent_4x_indices
618
 
619
  [total_generated_latent_frames, history_latents, history_pixels] = post_process(forward, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, section_index, total_latent_sections, outputs_folder, mp4_crf, stream)
620
 
 
628
  real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :]
629
  zero_latents = history_latents[:, :, total_generated_latent_frames:, :, :]
630
  history_latents = torch.cat([zero_latents, real_history_latents], dim=2)
631
+ del real_history_latents
632
+ del zero_latents
633
 
634
  forward = True
635
  section_index = first_section_index
 
648
  return
649
 
650
  @torch.no_grad()
651
+ def worker_start_end(input_image, end_image, image_position, end_stillness, prompts, n_prompt, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
652
  def encode_prompt(prompt, n_prompt):
653
  llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
654
 
 
757
  return [start_latent, end_latent, image_encoder_last_hidden_state]
758
 
759
  [start_latent, end_latent, image_encoder_last_hidden_state] = get_start_latent(input_image, has_end_image, end_image, height, width, vae, gpu, image_encoder, high_vram)
760
+ del input_image
761
+ del end_image
762
 
763
  # Dtype
764
  image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
 
778
  total_generated_latent_frames = 0
779
 
780
  if total_latent_sections > 4:
781
+ # In theory the latent_paddings should follow the else sequence, but it seems that duplicating some
782
  # items looks better than expanding it when total_latent_sections > 4
783
  # One can try to remove below trick and just
784
  # use `latent_paddings = list(reversed(range(total_latent_sections)))` to compare
 
857
  if len(prompt_parameters) > 0:
858
  [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n] = prompt_parameters.pop(len(prompt_parameters) - 1)
859
 
860
+ indices = torch.arange(1 + latent_padding_size + latent_window_size + 1 + (end_stillness if is_first_section else 0) + 2 + 16).unsqueeze(0)
861
+ clean_latent_indices_pre, blank_indices, latent_indices, clean_latent_indices_post, clean_latent_2x_indices, clean_latent_4x_indices = indices.split([1, latent_padding_size, latent_window_size, 1 + (end_stillness if is_first_section else 0), 2, 16], dim=1)
862
  clean_latent_indices = torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1)
863
 
864
  clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, :1 + 2 + 16, :, :].split([1, 2, 16], dim=2)
865
 
866
  # Use end image latent for the first section if provided
867
  if has_end_image and is_first_section:
868
+ clean_latents_post = end_latent.expand(-1, -1, 1 + end_stillness, -1, -1)
869
 
870
  clean_latents = torch.cat([start_latent, clean_latents_post], dim=2)
871
 
 
908
  clean_latent_4x_indices=clean_latent_4x_indices,
909
  callback=callback,
910
  )
911
+ del clean_latents
912
+ del clean_latents_2x
913
+ del clean_latents_4x
914
+ del latent_indices
915
+ del clean_latent_indices
916
+ del clean_latent_2x_indices
917
+ del clean_latent_4x_indices
918
 
919
  [total_generated_latent_frames, history_latents, history_pixels] = post_process(job_id, start_latent, generated_latents, total_generated_latent_frames, history_latents, high_vram, transformer, gpu, vae, history_pixels, latent_window_size, enable_preview, outputs_folder, mp4_crf, stream, is_last_section)
920
 
 
933
 
934
  # 20250506 pftq: Modified worker to accept video input and clean frame count
935
  @torch.no_grad()
936
+ def worker_video(input_video, end_frame, end_stillness, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
937
  def encode_prompt(prompt, n_prompt):
938
  llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
939
 
 
959
 
960
  # 20250506 pftq: Encode video
961
  start_latent, input_image_np, video_latents, fps, height, width = video_encode(input_video, resolution, no_resize, vae, vae_batch_size=vae_batch, device=gpu)
962
+ del input_video
963
  start_latent = start_latent.to(dtype=torch.float32, device=cpu)
964
  video_latents = video_latents.cpu()
965
 
 
997
  load_model_as_complete(image_encoder, target_device=gpu)
998
 
999
  image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
1000
+ del input_image_np
1001
+
1002
+ # 20250507 pftq: Process end frame if provided
1003
+ if end_frame is not None:
1004
+ if not high_vram:
1005
+ load_model_as_complete(vae, target_device=gpu)
1006
+
1007
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'End frame encoding ...'))))
1008
+ end_latent = image_encode(
1009
+ end_frame, target_width=width, target_height=height, vae=vae,
1010
+ image_encoder=image_encoder, feature_extractor=feature_extractor, device=gpu
1011
+ )[0]
1012
+ del end_frame
1013
+ end_latent = end_latent.to(dtype=torch.float32, device=cpu)
1014
+ else:
1015
+ end_latent = None
1016
 
1017
  # Clean GPU
1018
  if not high_vram:
1019
+ unload_complete_models(image_encoder, vae)
1020
 
1021
  image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
1022
+ del image_encoder_output
1023
 
1024
  # Dtype
1025
  image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
 
1046
  def callback(d):
1047
  return
1048
 
1049
+ def compute_latent(history_latents, latent_window_size, latent_padding_size, num_clean_frames, start_latent, end_latent, end_stillness, is_end_of_video):
1050
+ if is_end_of_video:
1051
+ local_end_stillness = end_stillness
1052
+ local_end_latent = end_latent.expand(-1, -1, 1 + local_end_stillness, -1, -1)
1053
+ else:
1054
+ local_end_stillness = 0
1055
+ local_end_latent = end_latent
1056
  # 20250506 pftq: Use user-specified number of context frames, matching original allocation for num_clean_frames=2
1057
  available_frames = history_latents.shape[2] # Number of latent frames
1058
  max_pixel_frames = min(latent_window_size * 4 - 3, available_frames * 4) # Cap at available pixel frames
 
1066
  total_context_frames = num_4x_frames + num_2x_frames + effective_clean_frames
1067
  total_context_frames = min(total_context_frames, available_frames) # 20250507 pftq: Edge case for <=1 sec videos
1068
 
1069
+ post_frames = 100 # Single frame for end_latent, otherwise padding causes still image
1070
+ indices = torch.arange(0, 1 + num_4x_frames + num_2x_frames + effective_clean_frames + adjusted_latent_frames + ((latent_padding_size + 1 + local_end_stillness) if end_latent is not None else 0)).unsqueeze(0) # 20250507 pftq: latent_window_size to adjusted_latent_frames for edge case for <=1 sec videos
1071
+ clean_latent_indices_start, clean_latent_4x_indices, clean_latent_2x_indices, clean_latent_1x_indices, latent_indices, blank_indices, clean_latent_indices_post = indices.split(
1072
+ [1, num_4x_frames, num_2x_frames, effective_clean_frames, adjusted_latent_frames, latent_padding_size if end_latent is not None else 0, (1 + local_end_stillness) if end_latent is not None else 0], dim=1 # 20250507 pftq: latent_window_size to adjusted_latent_frames for edge case for <=1 sec videos
1073
  )
1074
+ clean_latent_indices = torch.cat([clean_latent_indices_start, clean_latent_1x_indices, clean_latent_indices_post], dim=1)
1075
 
1076
  # 20250506 pftq: Split history_latents dynamically based on available frames
1077
  fallback_frame_count = 2 # 20250507 pftq: Changed 0 to 2 Edge case for <=1 sec videos
 
1104
  if effective_clean_frames > 0 and split_idx < len(splits):
1105
  clean_latents_1x = splits[split_idx]
1106
 
1107
+ if end_latent is not None:
1108
+ clean_latents = torch.cat([start_latent, clean_latents_1x, local_end_latent], dim=2)
1109
+ else:
1110
+ clean_latents = torch.cat([start_latent, clean_latents_1x], dim=2)
1111
 
1112
  # 20250507 pftq: Fix for <=1 sec videos.
1113
  max_frames = min(latent_window_size * 4 - 3, history_latents.shape[2] * 4)
 
1129
  history_latents = video_latents
1130
  total_generated_latent_frames = history_latents.shape[2]
1131
  # 20250506 pftq: Initialize history_pixels to fix UnboundLocalError
1132
+ history_pixels = previous_video = None
1133
+
1134
+ # 20250509 Generate backwards with end frame for better end frame anchoring
1135
+ if total_latent_sections > 4:
1136
+ latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0]
1137
+ else:
1138
+ latent_paddings = list(reversed(range(total_latent_sections)))
1139
 
1140
+ for section_index, latent_padding in enumerate(latent_paddings):
1141
+ is_start_of_video = latent_padding == 0
1142
+ is_end_of_video = latent_padding == latent_paddings[0]
1143
+ latent_padding_size = latent_padding * latent_window_size
1144
  if stream.input_queue.top() == 'end':
1145
  stream.output_queue.push(('end', None))
1146
  return
 
1159
  else:
1160
  transformer.initialize_teacache(enable_teacache=False)
1161
 
1162
+ [max_frames, clean_latents, clean_latents_2x, clean_latents_4x, latent_indices, clean_latents, clean_latent_indices, clean_latent_2x_indices, clean_latent_4x_indices] = compute_latent(history_latents, latent_window_size, latent_padding_size, num_clean_frames, start_latent, end_latent, end_stillness, is_end_of_video)
1163
 
1164
  generated_latents = sample_hunyuan(
1165
  transformer=transformer,
 
1190
  clean_latent_4x_indices=clean_latent_4x_indices,
1191
  callback=callback,
1192
  )
1193
+ del clean_latents
1194
+ del clean_latents_2x
1195
+ del clean_latents_4x
1196
+ del latent_indices
1197
+ del clean_latent_indices
1198
+ del clean_latent_2x_indices
1199
+ del clean_latent_4x_indices
1200
 
1201
  total_generated_latent_frames += int(generated_latents.shape[2])
1202
  history_latents = torch.cat([history_latents, generated_latents.to(history_latents)], dim=2)
 
1254
  stream.output_queue.push(('end', None))
1255
  return
1256
 
1257
+ def get_duration(input_image, end_image, image_position, end_stillness, prompts, generation_mode, n_prompt, seed, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
1258
  return allocation_time
1259
 
1260
  @spaces.GPU(duration=get_duration)
1261
+ def process_on_gpu(input_image, end_image, image_position, end_stillness, prompts, generation_mode, n_prompt, seed, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number
1262
  ):
1263
  start = time.time()
1264
  global stream
1265
  stream = AsyncStream()
1266
 
1267
+ async_run(worker_start_end if generation_mode == "start_end" else worker, input_image, end_image, image_position, end_stillness, prompts, n_prompt, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number)
1268
 
1269
  output_filename = None
1270
 
 
1290
  ((str(hours) + " h, ") if hours != 0 else "") + \
1291
  ((str(minutes) + " min, ") if hours != 0 or minutes != 0 else "") + \
1292
  str(secondes) + " sec. " + \
1293
+ "You can upscale the result with https://huggingface.co/spaces/Nick088/Real-ESRGAN_Pytorch. To make all your generated scenes consistent, you can then apply a face swap on the main character. If you do not see the generated video above, the process may have failed. See the logs for more information. If you see an error like ''NVML_SUCCESS == r INTERNAL ASSERT FAILED'', you probably haven't enough VRAM. Test an example or other options to compare. You can share your inputs to the original space or set your space in public for a peer review.", gr.update(interactive=True), gr.update(interactive=False), gr.update(visible = False)
1294
  break
1295
 
1296
  def process(input_image,
1297
  end_image,
1298
  image_position=0,
1299
+ end_stillness=1,
1300
  prompt="",
1301
  generation_mode="image",
1302
  n_prompt="",
 
1330
 
1331
  prompts = prompt.split(";")
1332
 
 
1333
  if generation_mode == "text":
1334
+ default_height, default_width = resolution, resolution
1335
  input_image = np.ones((default_height, default_width, 3), dtype=np.uint8) * 255
1336
  print("No input image provided. Using a blank white image.")
1337
+ assert input_image is not None, 'No input image!'
1338
+ assert (generation_mode != "start_end") or end_image is not None, 'No end image!'
1339
 
1340
  yield gr.update(label="Previewed Frames"), None, '', '', gr.update(interactive=False), gr.update(interactive=True), gr.skip()
1341
 
1342
+ gc.collect()
1343
  yield from process_on_gpu(input_image,
1344
  end_image,
1345
  image_position,
1346
+ end_stillness,
1347
  prompts,
1348
  generation_mode,
1349
  n_prompt,
 
1363
  fps_number
1364
  )
1365
 
1366
+ def get_duration_video(input_video, end_frame, end_stillness, prompts, n_prompt, seed, batch, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
1367
  return allocation_time
1368
 
1369
  @spaces.GPU(duration=get_duration_video)
1370
+ def process_video_on_gpu(input_video, end_frame, end_stillness, prompts, n_prompt, seed, batch, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
1371
  start = time.time()
1372
  global stream
1373
  stream = AsyncStream()
1374
 
1375
  # 20250506 pftq: Pass num_clean_frames, vae_batch, etc
1376
+ async_run(worker_video, input_video, end_frame, end_stillness, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch)
1377
 
1378
  output_filename = None
1379
 
 
1400
  ((str(hours) + " h, ") if hours != 0 else "") + \
1401
  ((str(minutes) + " min, ") if hours != 0 or minutes != 0 else "") + \
1402
  str(secondes) + " sec. " + \
1403
+ " You can upscale the result with https://huggingface.co/spaces/Nick088/Real-ESRGAN_Pytorch. To make all your generated scenes consistent, you can then apply a face swap on the main character. If you do not see the generated video above, the process may have failed. See the logs for more information. If you see an error like ''NVML_SUCCESS == r INTERNAL ASSERT FAILED'', you probably haven't enough VRAM. Test an example or other options to compare. You can share your inputs to the original space or set your space in public for a peer review.", '', gr.update(interactive=True), gr.update(interactive=False), gr.update(visible = False)
1404
  break
1405
 
1406
+ def process_video(input_video, end_frame, end_stillness, prompt, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
1407
  global high_vram
1408
  if auto_allocation:
1409
  allocation_time = min(total_second_length * 60 * (2.5 if use_teacache else 3.5) * (1 + ((steps - 25) / 25))**2, 600)
 
1436
  if cfg > 1:
1437
  gs = 1
1438
 
1439
+ gc.collect()
1440
+ yield from process_video_on_gpu(input_video, end_frame, end_stillness, prompt, n_prompt, seed, batch, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch)
1441
 
1442
  def end_process():
1443
  stream.input_queue.push('end')
 
1507
  local_storage = gr.BrowserState(default_local_storage)
1508
  with gr.Row():
1509
  with gr.Column():
1510
+ generation_mode = gr.Radio([["Text-to-Video", "text"], ["Image-to-Video", "image"], ["Start & end frames", "start_end"], ["Video Extension", "video"]], elem_id="generation-mode", label="Input mode", value = "image")
1511
  text_to_video_hint = gr.HTML("Text-to-Video badly works with a flash effect at the start. I discourage to use the Text-to-Video feature. You should rather generate an image with Flux and use Image-to-Video. You will save time.")
1512
  input_image = gr.Image(sources='upload', type="numpy", label="Image", height=320)
 
1513
  image_position = gr.Slider(label="Image position", minimum=0, maximum=100, value=0, step=1, info='0=Video start; 100=Video end (lower quality)')
1514
  input_video = gr.Video(sources='upload', label="Input Video", height=320)
1515
+ end_image = gr.Image(sources='upload', type="numpy", label="End Frame (optional)", height=320)
1516
  timeless_prompt = gr.Textbox(label="Timeless prompt", info='Used on the whole duration of the generation', value='', placeholder="The creature starts to move, fast motion, fixed camera, focus motion, consistent arm, consistent position, mute colors, insanely detailed")
1517
  prompt_number = gr.Slider(label="Timed prompt number", minimum=0, maximum=1000, value=0, step=1, info='Prompts will automatically appear')
1518
 
 
1539
  n_prompt = gr.Textbox(label="Negative Prompt", value="Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", info='Requires using normal CFG (undistilled) instead of Distilled (set Distilled=1 and CFG > 1).')
1540
 
1541
  fps_number = gr.Slider(label="Frame per seconds", info="The model is trained for 30 fps so other fps may generate weird results", minimum=10, maximum=60, value=30, step=1)
1542
+ end_stillness = gr.Slider(label="End stillness", minimum=0, maximum=100, value=1, step=1, info='0=Realistic end; >0=Matches exactly the end image (but the time seems to freeze)')
1543
 
1544
  latent_window_size = gr.Slider(label="Latent Window Size", minimum=1, maximum=33, value=9, step=1, info='Generate more frames at a time (larger chunks). Less degradation and better blending but higher VRAM cost. Should not change.')
1545
  steps = gr.Slider(label="Steps", minimum=1, maximum=100, value=30, step=1, info='Increase for more quality, especially if using high non-distilled CFG. If your animation has very few motion, you may have brutal brightness change; this can be fixed increasing the steps.')
 
1592
  progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
1593
  progress_bar = gr.HTML('', elem_classes='no-generating-animation')
1594
 
1595
+ ips = [input_image, end_image, image_position, end_stillness, final_prompt, generation_mode, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number]
1596
+ ips_video = [input_video, end_image, end_stillness, final_prompt, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch]
1597
 
1598
  gr.Examples(
1599
  label = "✍️ Examples from text",
 
1602
  None, # input_image
1603
  None, # end_image
1604
  0, # image_position
1605
+ 1, # end_stillness
1606
  "Overcrowed street in Japan, photorealistic, realistic, intricate details, 8k, insanely detailed",
1607
  "text", # generation_mode
1608
  "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
 
1638
  "./img_examples/Example1.png", # input_image
1639
  None, # end_image
1640
  0, # image_position
1641
+ 1, # end_stillness
1642
  "A dolphin emerges from the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
1643
  "image", # generation_mode
1644
  "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
 
1663
  "./img_examples/Example2.webp", # input_image
1664
  None, # end_image
1665
  0, # image_position
1666
+ 1, # end_stillness
1667
  "A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks, the man stops talking and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens",
1668
  "image", # generation_mode
1669
  "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
 
1688
  "./img_examples/Example2.webp", # input_image
1689
  None, # end_image
1690
  0, # image_position
1691
+ 1, # end_stillness
1692
  "A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks, the woman stops talking and the woman listens A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens",
1693
  "image", # generation_mode
1694
  "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
 
1713
  "./img_examples/Example3.jpg", # input_image
1714
  None, # end_image
1715
  0, # image_position
1716
+ 1, # end_stillness
1717
  "एउटा केटा दायाँतिर हिँडिरहेको छ, पूर्ण दृश्य, पूर्ण-लम्बाइको दृश्य, कार्टुन",
1718
  "image", # generation_mode
1719
  "हात छुटेको, लामो हात, अवास्तविक स्थिति, असम्भव विकृति, देखिने हड्डी, मांसपेशी संकुचन, कमजोर फ्रेम, धमिलो, धमिलो, अत्यधिक चिल्लो", # n_prompt
 
1738
  "./img_examples/Example4.webp", # input_image
1739
  None, # end_image
1740
  100, # image_position
1741
+ 1, # end_stillness
1742
  "A building starting to explode, photorealistic, realisitc, 8k, insanely detailed",
1743
  "image", # generation_mode
1744
  "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
 
1774
  "./img_examples/Example5.png", # input_image
1775
  "./img_examples/Example6.png", # end_image
1776
  0, # image_position
1777
+ 0, # end_stillness
1778
+ "A woman jumps out of the train and arrives on the ground, viewed from the outside, photorealistic, realistic, amateur photography, midday, insanely detailed, 8k", # prompt
1779
  "start_end", # generation_mode
1780
+ "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, jumpcut, crossfader, crossfading", # n_prompt
1781
  True, # randomize_seed
1782
  42, # seed
1783
  True, # auto_allocation
 
1808
  examples = [
1809
  [
1810
  "./img_examples/Example1.mp4", # input_video
1811
+ None, # end_image
1812
+ 1, # end_stillness
1813
  "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
1814
+ "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, jumpcut, crossfader, crossfading", # n_prompt
1815
+ True, # randomize_seed
1816
+ 42, # seed
1817
+ True, # auto_allocation
1818
+ 180, # allocation_time
1819
+ 1, # batch
1820
+ 672, # resolution
1821
+ 1, # total_second_length
1822
+ 9, # latent_window_size
1823
+ 30, # steps
1824
+ 1.0, # cfg
1825
+ 10.0, # gs
1826
+ 0.0, # rs
1827
+ 6, # gpu_memory_preservation
1828
+ False, # enable_preview
1829
+ True, # use_teacache
1830
+ False, # no_resize
1831
+ 16, # mp4_crf
1832
+ 5, # num_clean_frames
1833
+ default_vae
1834
+ ],
1835
+ [
1836
+ "./img_examples/Example1.mp4", # input_video
1837
+ "./img_examples/Example1.png", # end_image
1838
+ 1, # end_stillness
1839
+ "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
1840
+ "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth, jumpcut, crossfader, crossfading", # n_prompt
1841
  True, # randomize_seed
1842
  42, # seed
1843
  True, # auto_allocation
 
1895
  gr.update(visible = False), # image_position
1896
  gr.update(visible = False), # input_image
1897
  gr.update(visible = False), # end_image
1898
+ gr.update(visible = False), # end_stillness
1899
  gr.update(visible = False), # input_video
1900
  gr.update(visible = True), # start_button
1901
  gr.update(visible = False), # start_button_video
 
1912
  gr.update(visible = True), # image_position
1913
  gr.update(visible = True), # input_image
1914
  gr.update(visible = False), # end_image
1915
+ gr.update(visible = False), # end_stillness
1916
  gr.update(visible = False), # input_video
1917
  gr.update(visible = True), # start_button
1918
  gr.update(visible = False), # start_button_video
 
1929
  gr.update(visible = False), # image_position
1930
  gr.update(visible = True), # input_image
1931
  gr.update(visible = True), # end_image
1932
+ gr.update(visible = True), # end_stillness
1933
  gr.update(visible = False), # input_video
1934
  gr.update(visible = True), # start_button
1935
  gr.update(visible = False), # start_button_video
 
1945
  gr.update(visible = False), # text_to_video_hint
1946
  gr.update(visible = False), # image_position
1947
  gr.update(visible = False), # input_image
1948
+ gr.update(visible = True), # end_image
1949
+ gr.update(visible = True), # end_stillness
1950
  gr.update(visible = True), # input_video
1951
  gr.update(visible = False), # start_button
1952
  gr.update(visible = True), # start_button_video
 
1978
  generation_mode.change(
1979
  fn=handle_generation_mode_change,
1980
  inputs=[generation_mode],
1981
+ outputs=[text_to_video_hint, image_position, input_image, end_image, end_stillness, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch, prompt_hint, fps_number]
1982
  )
1983
 
1984
  # Update display when the page loads
 
1986
  fn=handle_generation_mode_change, inputs = [
1987
  generation_mode
1988
  ], outputs = [
1989
+ text_to_video_hint, image_position, input_image, end_image, end_stillness, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch, prompt_hint, fps_number
1990
  ]
1991
  )
1992