Fabrice-TIERCELIN committed on
Commit
76d0f82
·
verified ·
1 Parent(s): 1bd869c

Upload 2 files

Browse files
Files changed (2) hide show
  1. README.md +1 -1
  2. app.py +521 -102
README.md CHANGED
@@ -5,7 +5,7 @@ colorFrom: pink
5
  colorTo: gray
6
  sdk: gradio
7
  sdk_version: 5.29.1
8
- app_file: app_start_end.py
9
  license: apache-2.0
10
  short_description: Text-to-Video/Image-to-Video/Video extender (timed prompt)
11
  tags:
 
5
  colorTo: gray
6
  sdk: gradio
7
  sdk_version: 5.29.1
8
+ app_file: app.py
9
  license: apache-2.0
10
  short_description: Text-to-Video/Image-to-Video/Video extender (timed prompt)
11
  tags:
app.py CHANGED
@@ -13,9 +13,10 @@ import torch
13
  import traceback
14
  import einops
15
  import safetensors.torch as sf
16
- import numpy as np
17
  import random
18
  import time
 
 
19
  import math
20
  # 20250506 pftq: Added for video input loading
21
  import decord
@@ -38,74 +39,77 @@ from diffusers_helper.hunyuan import encode_prompt_conds, vae_decode, vae_encode
38
  from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, state_dict_weighted_merge, state_dict_offset_merge, generate_timestamp
39
  from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
40
  from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
41
- if torch.cuda.device_count() > 0:
42
- from diffusers_helper.memory import cpu, gpu, get_cuda_free_memory_gb, move_model_to_device_with_memory_preservation, offload_model_from_device_for_memory_preservation, fake_diffusers_current_device, DynamicSwapInstaller, unload_complete_models, load_model_as_complete
43
  from diffusers_helper.thread_utils import AsyncStream, async_run
44
  from diffusers_helper.gradio.progress_bar import make_progress_bar_css, make_progress_bar_html
45
  from transformers import SiglipImageProcessor, SiglipVisionModel
46
  from diffusers_helper.clip_vision import hf_clip_vision_encode
47
  from diffusers_helper.bucket_tools import find_nearest_bucket
48
- from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig, HunyuanVideoTransformer3DModel, HunyuanVideoPipeline
49
- import pillow_heif
50
-
51
- pillow_heif.register_heif_opener()
52
-
53
- high_vram = False
54
- free_mem_gb = 0
55
-
56
- if torch.cuda.device_count() > 0:
57
- free_mem_gb = get_cuda_free_memory_gb(gpu)
58
- high_vram = free_mem_gb > 60
59
-
60
- #print(f'Free VRAM {free_mem_gb} GB')
61
- #print(f'High-VRAM Mode: {high_vram}')
62
-
63
- text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder', torch_dtype=torch.float16).cpu()
64
- text_encoder_2 = CLIPTextModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder_2', torch_dtype=torch.float16).cpu()
65
- tokenizer = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer')
66
- tokenizer_2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer_2')
67
- vae = AutoencoderKLHunyuanVideo.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='vae', torch_dtype=torch.float16).cpu()
68
-
69
- feature_extractor = SiglipImageProcessor.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='feature_extractor')
70
- image_encoder = SiglipVisionModel.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='image_encoder', torch_dtype=torch.float16).cpu()
71
-
72
- transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained('lllyasviel/FramePack_F1_I2V_HY_20250503', torch_dtype=torch.bfloat16).cpu()
73
 
74
- vae.eval()
75
- text_encoder.eval()
76
- text_encoder_2.eval()
77
- image_encoder.eval()
78
- transformer.eval()
79
 
80
- if not high_vram:
81
- vae.enable_slicing()
82
- vae.enable_tiling()
83
-
84
- transformer.high_quality_fp32_output_for_inference = True
85
- #print('transformer.high_quality_fp32_output_for_inference = True')
86
-
87
- transformer.to(dtype=torch.bfloat16)
88
- vae.to(dtype=torch.float16)
89
- image_encoder.to(dtype=torch.float16)
90
- text_encoder.to(dtype=torch.float16)
91
- text_encoder_2.to(dtype=torch.float16)
92
-
93
- vae.requires_grad_(False)
94
- text_encoder.requires_grad_(False)
95
- text_encoder_2.requires_grad_(False)
96
- image_encoder.requires_grad_(False)
97
- transformer.requires_grad_(False)
98
-
99
- if not high_vram:
100
- # DynamicSwapInstaller is same as huggingface's enable_sequential_offload but 3x faster
101
- DynamicSwapInstaller.install_model(transformer, device=gpu)
102
- DynamicSwapInstaller.install_model(text_encoder, device=gpu)
103
- else:
104
- text_encoder.to(gpu)
105
- text_encoder_2.to(gpu)
106
- image_encoder.to(gpu)
107
- vae.to(gpu)
108
- transformer.to(gpu)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
  stream = AsyncStream()
111
 
@@ -114,6 +118,7 @@ os.makedirs(outputs_folder, exist_ok=True)
114
 
115
  input_image_debug_value = [None]
116
  input_video_debug_value = [None]
 
117
  prompt_debug_value = [None]
118
  total_second_length_debug_value = [None]
119
 
@@ -308,7 +313,7 @@ def set_mp4_comments_imageio_ffmpeg(input_file, comments):
308
  return False
309
 
310
  @torch.no_grad()
311
- def worker(input_image, image_position, prompts, n_prompt, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
312
  def encode_prompt(prompt, n_prompt):
313
  llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
314
 
@@ -577,6 +582,269 @@ def worker(input_image, image_position, prompts, n_prompt, seed, resolution, tot
577
  stream.output_queue.push(('end', None))
578
  return
579
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
580
  # 20250506 pftq: Modified worker to accept video input and clean frame count
581
  @torch.no_grad()
582
  def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
@@ -857,18 +1125,18 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
857
  stream.output_queue.push(('end', None))
858
  return
859
 
860
- def get_duration(input_image, image_position, prompts, generation_mode, n_prompt, seed, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
861
  return allocation_time
862
 
863
  # Remove this decorator if you run on local
864
  @spaces.GPU(duration=get_duration)
865
- def process_on_gpu(input_image, image_position, prompts, generation_mode, n_prompt, seed, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number
866
  ):
867
  start = time.time()
868
  global stream
869
  stream = AsyncStream()
870
 
871
- async_run(worker, input_image, image_position, prompts, n_prompt, seed, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number)
872
 
873
  output_filename = None
874
 
@@ -899,6 +1167,7 @@ def process_on_gpu(input_image, image_position, prompts, generation_mode, n_prom
899
 
900
  def process(input_image,
901
  image_position=0,
 
902
  prompt="",
903
  generation_mode="image",
904
  n_prompt="",
@@ -909,12 +1178,12 @@ def process(input_image,
909
  resolution=640,
910
  total_second_length=5,
911
  latent_window_size=9,
912
- steps=25,
913
  cfg=1.0,
914
  gs=10.0,
915
  rs=0.0,
916
  gpu_memory_preservation=6,
917
- enable_preview=True,
918
  use_teacache=False,
919
  mp4_crf=16,
920
  fps_number=30
@@ -922,12 +1191,13 @@ def process(input_image,
922
  if auto_allocation:
923
  allocation_time = min(total_second_length * 60 * (1.5 if use_teacache else 3.0) * (1 + ((steps - 25) / 25))**2, 600)
924
 
925
- if input_image_debug_value[0] is not None or prompt_debug_value[0] is not None or total_second_length_debug_value[0] is not None:
926
  input_image = input_image_debug_value[0]
 
927
  prompt = prompt_debug_value[0]
928
  total_second_length = total_second_length_debug_value[0]
929
  allocation_time = min(total_second_length_debug_value[0] * 60 * 100, 600)
930
- input_image_debug_value[0] = prompt_debug_value[0] = total_second_length_debug_value[0] = None
931
 
932
  if torch.cuda.device_count() == 0:
933
  gr.Warning('Set this space to GPU config to make it work.')
@@ -949,6 +1219,7 @@ def process(input_image,
949
 
950
  yield from process_on_gpu(input_image,
951
  image_position,
 
952
  prompts,
953
  generation_mode,
954
  n_prompt,
@@ -1019,7 +1290,7 @@ def process_video(input_video, prompt, n_prompt, randomize_seed, seed, auto_allo
1019
  prompt = prompt_debug_value[0]
1020
  total_second_length = total_second_length_debug_value[0]
1021
  allocation_time = min(total_second_length_debug_value[0] * 60 * 100, 600)
1022
- input_video_debug_value[0] = prompt_debug_value[0] = total_second_length_debug_value[0] = None
1023
 
1024
  if torch.cuda.device_count() == 0:
1025
  gr.Warning('Set this space to GPU config to make it work.')
@@ -1119,9 +1390,10 @@ with block:
1119
  local_storage = gr.BrowserState(default_local_storage)
1120
  with gr.Row():
1121
  with gr.Column():
1122
- generation_mode = gr.Radio([["Text-to-Video", "text"], ["Image-to-Video", "image"], ["Video Extension", "video"]], elem_id="generation-mode", label="Generation mode", value = "image")
1123
  text_to_video_hint = gr.HTML("Text-to-Video badly works with a flash effect at the start. I discourage to use the Text-to-Video feature. You should rather generate an image with Flux and use Image-to-Video. You will save time.")
1124
  input_image = gr.Image(sources='upload', type="numpy", label="Image", height=320)
 
1125
  image_position = gr.Slider(label="Image position", minimum=0, maximum=100, value=0, step=1, info='0=Video start; 100=Video end (lower quality)')
1126
  input_video = gr.Video(sources='upload', label="Input Video", height=320)
1127
  timeless_prompt = gr.Textbox(label="Timeless prompt", info='Used on the whole duration of the generation', value='', placeholder="The creature starts to move, fast motion, fixed camera, focus motion, consistent arm, consistent position, mute colors, insanely detailed")
@@ -1147,7 +1419,7 @@ with block:
1147
  enable_preview = gr.Checkbox(label='Enable preview', value=True, info='Display a preview around each second generated but it costs 2 sec. for each second generated.')
1148
  use_teacache = gr.Checkbox(label='Use TeaCache', value=False, info='Faster speed and no break in brightness, but often makes hands and fingers slightly worse.')
1149
 
1150
- n_prompt = gr.Textbox(label="Negative Prompt", value="Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry, over-smooth", info='Requires using normal CFG (undistilled) instead of Distilled (set Distilled=1 and CFG > 1).')
1151
 
1152
  fps_number = gr.Slider(label="Frame per seconds", info="The model is trained for 30 fps so other fps may generate weird results", minimum=10, maximum=60, value=30, step=1)
1153
 
@@ -1197,6 +1469,7 @@ with block:
1197
 
1198
  with gr.Accordion("Debug", open=False):
1199
  input_image_debug = gr.Image(type="numpy", label="Image Debug", height=320)
 
1200
  input_video_debug = gr.Video(sources='upload', label="Input Video Debug", height=320)
1201
  prompt_debug = gr.Textbox(label="Prompt Debug", value='')
1202
  total_second_length_debug = gr.Slider(label="Additional Video Length to Generate (seconds) Debug", minimum=1, maximum=120, value=1, step=0.1)
@@ -1208,8 +1481,7 @@ with block:
1208
  progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
1209
  progress_bar = gr.HTML('', elem_classes='no-generating-animation')
1210
 
1211
- # 20250506 pftq: Updated inputs to include num_clean_frames
1212
- ips = [input_image, image_position, final_prompt, generation_mode, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number]
1213
  ips_video = [input_video, final_prompt, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch]
1214
 
1215
  with gr.Row(elem_id="text_examples", visible=False):
@@ -1219,9 +1491,10 @@ with block:
1219
  [
1220
  None, # input_image
1221
  0, # image_position
 
1222
  "Overcrowed street in Japan, photorealistic, realistic, intricate details, 8k, insanely detailed",
1223
  "text", # generation_mode
1224
- "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry, over-smooth", # n_prompt
1225
  True, # randomize_seed
1226
  42, # seed
1227
  True, # auto_allocation
@@ -1254,9 +1527,10 @@ with block:
1254
  [
1255
  "./img_examples/Example2.webp", # input_image
1256
  0, # image_position
 
1257
  "A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks, the man stops talking and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens",
1258
  "image", # generation_mode
1259
- "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry, over-smooth", # n_prompt
1260
  True, # randomize_seed
1261
  42, # seed
1262
  True, # auto_allocation
@@ -1277,9 +1551,10 @@ with block:
1277
  [
1278
  "./img_examples/Example1.png", # input_image
1279
  0, # image_position
 
1280
  "A dolphin emerges from the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
1281
  "image", # generation_mode
1282
- "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry, over-smooth", # n_prompt
1283
  True, # randomize_seed
1284
  42, # seed
1285
  True, # auto_allocation
@@ -1300,9 +1575,10 @@ with block:
1300
  [
1301
  "./img_examples/Example4.webp", # input_image
1302
  1, # image_position
 
1303
  "A building starting to explode, photorealistic, realisitc, 8k, insanely detailed",
1304
  "image", # generation_mode
1305
- "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry, over-smooth", # n_prompt
1306
  True, # randomize_seed
1307
  42, # seed
1308
  True, # auto_allocation
@@ -1323,9 +1599,10 @@ with block:
1323
  [
1324
  "./img_examples/Example4.webp", # input_image
1325
  50, # image_position
 
1326
  "A building starting to explode, photorealistic, realisitc, 8k, insanely detailed",
1327
  "image", # generation_mode
1328
- "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry, over-smooth", # n_prompt
1329
  True, # randomize_seed
1330
  42, # seed
1331
  True, # auto_allocation
@@ -1346,9 +1623,46 @@ with block:
1346
  [
1347
  "./img_examples/Example4.webp", # input_image
1348
  100, # image_position
 
1349
  "A building starting to explode, photorealistic, realisitc, 8k, insanely detailed",
1350
  "image", # generation_mode
1351
- "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry, over-smooth", # n_prompt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1352
  True, # randomize_seed
1353
  42, # seed
1354
  True, # auto_allocation
@@ -1381,7 +1695,7 @@ with block:
1381
  [
1382
  "./img_examples/Example1.mp4", # input_video
1383
  "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
1384
- "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry, over-smooth", # n_prompt
1385
  True, # randomize_seed
1386
  42, # seed
1387
  True, # auto_allocation
@@ -1405,7 +1719,7 @@ with block:
1405
  [
1406
  "./img_examples/Example1.mp4", # input_video
1407
  "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
1408
- "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry, over-smooth", # n_prompt
1409
  True, # randomize_seed
1410
  42, # seed
1411
  True, # auto_allocation
@@ -1440,9 +1754,10 @@ with block:
1440
  [
1441
  None, # input_image
1442
  0, # image_position
 
1443
  "Overcrowed street in Japan, photorealistic, realistic, intricate details, 8k, insanely detailed",
1444
  "text", # generation_mode
1445
- "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry, over-smooth", # n_prompt
1446
  True, # randomize_seed
1447
  42, # seed
1448
  True, # auto_allocation
@@ -1474,9 +1789,10 @@ with block:
1474
  [
1475
  "./img_examples/Example1.png", # input_image
1476
  0, # image_position
 
1477
  "A dolphin emerges from the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
1478
  "image", # generation_mode
1479
- "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry, over-smooth", # n_prompt
1480
  True, # randomize_seed
1481
  42, # seed
1482
  True, # auto_allocation
@@ -1497,9 +1813,10 @@ with block:
1497
  [
1498
  "./img_examples/Example2.webp", # input_image
1499
  0, # image_position
 
1500
  "A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks, the man stops talking and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens",
1501
  "image", # generation_mode
1502
- "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry, over-smooth", # n_prompt
1503
  True, # randomize_seed
1504
  42, # seed
1505
  True, # auto_allocation
@@ -1520,9 +1837,10 @@ with block:
1520
  [
1521
  "./img_examples/Example2.webp", # input_image
1522
  0, # image_position
 
1523
  "A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks, the woman stops talking and the woman listens A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens",
1524
  "image", # generation_mode
1525
- "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry, over-smooth", # n_prompt
1526
  True, # randomize_seed
1527
  42, # seed
1528
  True, # auto_allocation
@@ -1543,9 +1861,10 @@ with block:
1543
  [
1544
  "./img_examples/Example3.jpg", # input_image
1545
  0, # image_position
 
1546
  "A boy is walking to the right, full view, full-length view, cartoon",
1547
  "image", # generation_mode
1548
- "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry, over-smooth", # n_prompt
1549
  True, # randomize_seed
1550
  42, # seed
1551
  True, # auto_allocation
@@ -1566,9 +1885,10 @@ with block:
1566
  [
1567
  "./img_examples/Example4.webp", # input_image
1568
  100, # image_position
 
1569
  "A building starting to explode, photorealistic, realisitc, 8k, insanely detailed",
1570
  "image", # generation_mode
1571
- "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry, over-smooth", # n_prompt
1572
  True, # randomize_seed
1573
  42, # seed
1574
  True, # auto_allocation
@@ -1594,13 +1914,48 @@ with block:
1594
  cache_examples = False,
1595
  )
1596
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1597
  gr.Examples(
1598
  label = "🎥 Examples from video",
1599
  examples = [
1600
  [
1601
  "./img_examples/Example1.mp4", # input_video
1602
  "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
1603
- "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry, over-smooth", # n_prompt
1604
  True, # randomize_seed
1605
  42, # seed
1606
  True, # auto_allocation
@@ -1651,42 +2006,106 @@ with block:
1651
 
1652
  def handle_generation_mode_change(generation_mode_data):
1653
  if generation_mode_data == "text":
1654
- return [gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1655
  elif generation_mode_data == "image":
1656
- return [gr.update(visible = False), gr.update(visible = True), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1657
  elif generation_mode_data == "video":
1658
- return [gr.update(visible = False), gr.update(visible = False), gr.update(visible = False), gr.update(visible = True), gr.update(visible = False), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = True), gr.update(visible = False)]
1659
-
1660
-
1661
- def handle_field_debug_change(input_image_debug_data, input_video_debug_data, prompt_debug_data, total_second_length_debug_data):
 
 
 
 
 
 
 
 
 
 
 
 
 
1662
  print("handle_field_debug_change")
1663
  input_image_debug_value[0] = input_image_debug_data
1664
  input_video_debug_value[0] = input_video_debug_data
 
1665
  prompt_debug_value[0] = prompt_debug_data
1666
  total_second_length_debug_value[0] = total_second_length_debug_data
1667
  return []
1668
 
1669
  input_image_debug.upload(
1670
  fn=handle_field_debug_change,
1671
- inputs=[input_image_debug, input_video_debug, prompt_debug, total_second_length_debug],
1672
  outputs=[]
1673
  )
1674
 
1675
  input_video_debug.upload(
1676
  fn=handle_field_debug_change,
1677
- inputs=[input_image_debug, input_video_debug, prompt_debug, total_second_length_debug],
 
 
 
 
 
 
1678
  outputs=[]
1679
  )
1680
 
1681
  prompt_debug.change(
1682
  fn=handle_field_debug_change,
1683
- inputs=[input_image_debug, input_video_debug, prompt_debug, total_second_length_debug],
1684
  outputs=[]
1685
  )
1686
 
1687
  total_second_length_debug.change(
1688
  fn=handle_field_debug_change,
1689
- inputs=[input_image_debug, input_video_debug, prompt_debug, total_second_length_debug],
1690
  outputs=[]
1691
  )
1692
 
@@ -1710,7 +2129,7 @@ with block:
1710
  generation_mode.change(
1711
  fn=handle_generation_mode_change,
1712
  inputs=[generation_mode],
1713
- outputs=[text_to_video_hint, image_position, input_image, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch, prompt_hint, fps_number]
1714
  )
1715
 
1716
  # Update display when the page loads
@@ -1718,7 +2137,7 @@ with block:
1718
  fn=handle_generation_mode_change, inputs = [
1719
  generation_mode
1720
  ], outputs = [
1721
- text_to_video_hint, image_position, input_image, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch, prompt_hint, fps_number
1722
  ]
1723
  )
1724
 
 
13
  import traceback
14
  import einops
15
  import safetensors.torch as sf
 
16
  import random
17
  import time
18
+ import numpy as np
19
+ import argparse
20
  import math
21
  # 20250506 pftq: Added for video input loading
22
  import decord
 
39
  from diffusers_helper.utils import save_bcthw_as_mp4, crop_or_pad_yield_mask, soft_append_bcthw, resize_and_center_crop, state_dict_weighted_merge, state_dict_offset_merge, generate_timestamp
40
  from diffusers_helper.models.hunyuan_video_packed import HunyuanVideoTransformer3DModelPacked
41
  from diffusers_helper.pipelines.k_diffusion_hunyuan import sample_hunyuan
42
+ from diffusers_helper.memory import cpu, gpu, get_cuda_free_memory_gb, move_model_to_device_with_memory_preservation, offload_model_from_device_for_memory_preservation, fake_diffusers_current_device, DynamicSwapInstaller, unload_complete_models, load_model_as_complete
 
43
  from diffusers_helper.thread_utils import AsyncStream, async_run
44
  from diffusers_helper.gradio.progress_bar import make_progress_bar_css, make_progress_bar_html
45
  from transformers import SiglipImageProcessor, SiglipVisionModel
46
  from diffusers_helper.clip_vision import hf_clip_vision_encode
47
  from diffusers_helper.bucket_tools import find_nearest_bucket
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
 
 
 
 
 
49
 
50
+ parser = argparse.ArgumentParser()
51
+ parser.add_argument('--share', action='store_true')
52
+ parser.add_argument("--server", type=str, default='0.0.0.0')
53
+ parser.add_argument("--port", type=int, required=False)
54
+ parser.add_argument("--inbrowser", action='store_true')
55
+ args = parser.parse_args()
56
+
57
+ # for win desktop probably use --server 127.0.0.1 --inbrowser
58
+ # For linux server probably use --server 127.0.0.1 or do not use any cmd flags
59
+ print(args)
60
+
61
+ free_mem_gb = get_cuda_free_memory_gb(gpu)
62
+ high_vram = free_mem_gb > 60
63
+
64
+ print(f'Free VRAM {free_mem_gb} GB')
65
+ print(f'High-VRAM Mode: {high_vram}')
66
+
67
+ text_encoder = LlamaModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder', torch_dtype=torch.float16).cpu()
68
+ text_encoder_2 = CLIPTextModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='text_encoder_2', torch_dtype=torch.float16).cpu()
69
+ tokenizer = LlamaTokenizerFast.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer')
70
+ tokenizer_2 = CLIPTokenizer.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='tokenizer_2')
71
+ vae = AutoencoderKLHunyuanVideo.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder='vae', torch_dtype=torch.float16).cpu()
72
+
73
+ feature_extractor = SiglipImageProcessor.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='feature_extractor')
74
+ image_encoder = SiglipVisionModel.from_pretrained("lllyasviel/flux_redux_bfl", subfolder='image_encoder', torch_dtype=torch.float16).cpu()
75
+
76
+ transformer = HunyuanVideoTransformer3DModelPacked.from_pretrained('lllyasviel/FramePackI2V_HY', torch_dtype=torch.bfloat16).cpu()
77
+
78
+ vae.eval()
79
+ text_encoder.eval()
80
+ text_encoder_2.eval()
81
+ image_encoder.eval()
82
+ transformer.eval()
83
+
84
+ if not high_vram:
85
+ vae.enable_slicing()
86
+ vae.enable_tiling()
87
+
88
+ transformer.high_quality_fp32_output_for_inference = True
89
+ print('transformer.high_quality_fp32_output_for_inference = True')
90
+
91
+ transformer.to(dtype=torch.bfloat16)
92
+ vae.to(dtype=torch.float16)
93
+ image_encoder.to(dtype=torch.float16)
94
+ text_encoder.to(dtype=torch.float16)
95
+ text_encoder_2.to(dtype=torch.float16)
96
+
97
+ vae.requires_grad_(False)
98
+ text_encoder.requires_grad_(False)
99
+ text_encoder_2.requires_grad_(False)
100
+ image_encoder.requires_grad_(False)
101
+ transformer.requires_grad_(False)
102
+
103
+ if not high_vram:
104
+ # DynamicSwapInstaller is same as huggingface's enable_sequential_offload but 3x faster
105
+ DynamicSwapInstaller.install_model(transformer, device=gpu)
106
+ DynamicSwapInstaller.install_model(text_encoder, device=gpu)
107
+ else:
108
+ text_encoder.to(gpu)
109
+ text_encoder_2.to(gpu)
110
+ image_encoder.to(gpu)
111
+ vae.to(gpu)
112
+ transformer.to(gpu)
113
 
114
  stream = AsyncStream()
115
 
 
118
 
119
  input_image_debug_value = [None]
120
  input_video_debug_value = [None]
121
+ end_image_debug_value = [None]
122
  prompt_debug_value = [None]
123
  total_second_length_debug_value = [None]
124
 
 
313
  return False
314
 
315
  @torch.no_grad()
316
+ def worker(input_image, image_position, end_image, prompts, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, fps_number):
317
  def encode_prompt(prompt, n_prompt):
318
  llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
319
 
 
582
  stream.output_queue.push(('end', None))
583
  return
584
 
585
+ @torch.no_grad()
586
+ def worker_start_end(input_image, image_position, end_image, prompts, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, fps_number):
587
+ def encode_prompt(prompt, n_prompt):
588
+ llama_vec, clip_l_pooler = encode_prompt_conds(prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
589
+
590
+ if cfg == 1:
591
+ llama_vec_n, clip_l_pooler_n = torch.zeros_like(llama_vec), torch.zeros_like(clip_l_pooler)
592
+ else:
593
+ llama_vec_n, clip_l_pooler_n = encode_prompt_conds(n_prompt, text_encoder, text_encoder_2, tokenizer, tokenizer_2)
594
+
595
+ llama_vec, llama_attention_mask = crop_or_pad_yield_mask(llama_vec, length=512)
596
+ llama_vec_n, llama_attention_mask_n = crop_or_pad_yield_mask(llama_vec_n, length=512)
597
+
598
+ llama_vec = llama_vec.to(transformer.dtype)
599
+ llama_vec_n = llama_vec_n.to(transformer.dtype)
600
+ clip_l_pooler = clip_l_pooler.to(transformer.dtype)
601
+ clip_l_pooler_n = clip_l_pooler_n.to(transformer.dtype)
602
+ return [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n]
603
+
604
+ total_latent_sections = (total_second_length * fps_number) / (latent_window_size * 4)
605
+ total_latent_sections = int(max(round(total_latent_sections), 1))
606
+
607
+ job_id = generate_timestamp()
608
+
609
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Starting ...'))))
610
+
611
+ try:
612
+ # Clean GPU
613
+ if not high_vram:
614
+ unload_complete_models(
615
+ text_encoder, text_encoder_2, image_encoder, vae, transformer
616
+ )
617
+
618
+ # Text encoding
619
+
620
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Text encoding ...'))))
621
+
622
+ if not high_vram:
623
+ fake_diffusers_current_device(text_encoder, gpu) # since we only encode one text - that is one model move and one encode, offload is same time consumption since it is also one load and one encode.
624
+ load_model_as_complete(text_encoder_2, target_device=gpu)
625
+
626
+
627
+ prompt_parameters = []
628
+
629
+ for prompt_part in prompts[:total_latent_sections]:
630
+ prompt_parameters.append(encode_prompt(prompt_part, n_prompt))
631
+
632
+ # Clean GPU
633
+ if not high_vram:
634
+ unload_complete_models(
635
+ text_encoder, text_encoder_2
636
+ )
637
+
638
+ # Processing input image (start frame)
639
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Processing start frame ...'))))
640
+
641
+ H, W, C = input_image.shape
642
+ height, width = find_nearest_bucket(H, W, resolution=640)
643
+ input_image_np = resize_and_center_crop(input_image, target_width=width, target_height=height)
644
+
645
+ Image.fromarray(input_image_np).save(os.path.join(outputs_folder, f'{job_id}_start.png'))
646
+
647
+ input_image_pt = torch.from_numpy(input_image_np).float() / 127.5 - 1
648
+ input_image_pt = input_image_pt.permute(2, 0, 1)[None, :, None]
649
+
650
+ # Processing end image (if provided)
651
+ has_end_image = end_image is not None
652
+ if has_end_image:
653
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Processing end frame ...'))))
654
+
655
+ H_end, W_end, C_end = end_image.shape
656
+ end_image_np = resize_and_center_crop(end_image, target_width=width, target_height=height)
657
+
658
+ Image.fromarray(end_image_np).save(os.path.join(outputs_folder, f'{job_id}_end.png'))
659
+
660
+ end_image_pt = torch.from_numpy(end_image_np).float() / 127.5 - 1
661
+ end_image_pt = end_image_pt.permute(2, 0, 1)[None, :, None]
662
+
663
+ # VAE encoding
664
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'VAE encoding ...'))))
665
+
666
+ if not high_vram:
667
+ load_model_as_complete(vae, target_device=gpu)
668
+
669
+ start_latent = vae_encode(input_image_pt, vae)
670
+
671
+ if has_end_image:
672
+ end_latent = vae_encode(end_image_pt, vae)
673
+
674
+ # CLIP Vision
675
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
676
+
677
+ if not high_vram:
678
+ load_model_as_complete(image_encoder, target_device=gpu)
679
+
680
+ image_encoder_output = hf_clip_vision_encode(input_image_np, feature_extractor, image_encoder)
681
+ image_encoder_last_hidden_state = image_encoder_output.last_hidden_state
682
+
683
+ if has_end_image:
684
+ end_image_encoder_output = hf_clip_vision_encode(end_image_np, feature_extractor, image_encoder)
685
+ end_image_encoder_last_hidden_state = end_image_encoder_output.last_hidden_state
686
+ # Combine both image embeddings or use a weighted approach
687
+ image_encoder_last_hidden_state = (image_encoder_last_hidden_state + end_image_encoder_last_hidden_state) / 2
688
+
689
+ # Clean GPU
690
+ if not high_vram:
691
+ unload_complete_models(
692
+ image_encoder
693
+ )
694
+
695
+ # Dtype
696
+ image_encoder_last_hidden_state = image_encoder_last_hidden_state.to(transformer.dtype)
697
+
698
+ # Sampling
699
+ stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'Start sampling ...'))))
700
+
701
+ rnd = torch.Generator("cpu").manual_seed(seed)
702
+ num_frames = latent_window_size * 4 - 3
703
+
704
+ history_latents = torch.zeros(size=(1, 16, 1 + 2 + 16, height // 8, width // 8), dtype=torch.float32, device=cpu)
705
+ history_pixels = None
706
+ total_generated_latent_frames = 0
707
+
708
+ # 将迭代器转换为列表
709
+ latent_paddings = list(reversed(range(total_latent_sections)))
710
+
711
+ if total_latent_sections > 4:
712
+ # In theory the latent_paddings should follow the above sequence, but it seems that duplicating some
713
+ # items looks better than expanding it when total_latent_sections > 4
714
+ # One can try to remove below trick and just
715
+ # use `latent_paddings = list(reversed(range(total_latent_sections)))` to compare
716
+ latent_paddings = [3] + [2] * (total_latent_sections - 3) + [1, 0]
717
+
718
+ for latent_padding in latent_paddings:
719
+ is_last_section = latent_padding == 0
720
+ is_first_section = latent_padding == latent_paddings[0]
721
+ latent_padding_size = latent_padding * latent_window_size
722
+
723
+ if stream.input_queue.top() == 'end':
724
+ stream.output_queue.push(('end', None))
725
+ return
726
+
727
+ print(f'latent_padding_size = {latent_padding_size}, is_last_section = {is_last_section}, is_first_section = {is_first_section}')
728
+
729
+ if len(prompt_parameters) > 0:
730
+ [llama_vec, clip_l_pooler, llama_vec_n, clip_l_pooler_n, llama_attention_mask, llama_attention_mask_n] = prompt_parameters.pop(len(prompt_parameters) - 1)
731
+
732
+ indices = torch.arange(0, sum([1, latent_padding_size, latent_window_size, 1, 2, 16])).unsqueeze(0)
733
+ clean_latent_indices_pre, blank_indices, latent_indices, clean_latent_indices_post, clean_latent_2x_indices, clean_latent_4x_indices = indices.split([1, latent_padding_size, latent_window_size, 1, 2, 16], dim=1)
734
+ clean_latent_indices = torch.cat([clean_latent_indices_pre, clean_latent_indices_post], dim=1)
735
+
736
+ clean_latents_pre = start_latent.to(history_latents)
737
+ clean_latents_post, clean_latents_2x, clean_latents_4x = history_latents[:, :, :1 + 2 + 16, :, :].split([1, 2, 16], dim=2)
738
+ clean_latents = torch.cat([clean_latents_pre, clean_latents_post], dim=2)
739
+
740
+ # Use end image latent for the first section if provided
741
+ if has_end_image and is_first_section:
742
+ clean_latents_post = end_latent.to(history_latents)
743
+ clean_latents = torch.cat([clean_latents_pre, clean_latents_post], dim=2)
744
+
745
+ if not high_vram:
746
+ unload_complete_models()
747
+ move_model_to_device_with_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=gpu_memory_preservation)
748
+
749
+ if use_teacache:
750
+ transformer.initialize_teacache(enable_teacache=True, num_steps=steps)
751
+ else:
752
+ transformer.initialize_teacache(enable_teacache=False)
753
+
754
+ def callback(d):
755
+ preview = d['denoised']
756
+ preview = vae_decode_fake(preview)
757
+
758
+ preview = (preview * 255.0).detach().cpu().numpy().clip(0, 255).astype(np.uint8)
759
+ preview = einops.rearrange(preview, 'b c t h w -> (b h) (t w) c')
760
+
761
+ if stream.input_queue.top() == 'end':
762
+ stream.output_queue.push(('end', None))
763
+ raise KeyboardInterrupt('User ends the task.')
764
+
765
+ current_step = d['i'] + 1
766
+ percentage = int(100.0 * current_step / steps)
767
+ hint = f'Sampling {current_step}/{steps}'
768
+ desc = f'Total generated frames: {int(max(0, total_generated_latent_frames * 4 - 3))}, Video length: {max(0, (total_generated_latent_frames * 4 - 3) / fps_number) :.2f} seconds (FPS-30). The video is being extended now ...'
769
+ stream.output_queue.push(('progress', (preview, desc, make_progress_bar_html(percentage, hint))))
770
+ return
771
+
772
+ generated_latents = sample_hunyuan(
773
+ transformer=transformer,
774
+ sampler='unipc',
775
+ width=width,
776
+ height=height,
777
+ frames=num_frames,
778
+ real_guidance_scale=cfg,
779
+ distilled_guidance_scale=gs,
780
+ guidance_rescale=rs,
781
+ # shift=3.0,
782
+ num_inference_steps=steps,
783
+ generator=rnd,
784
+ prompt_embeds=llama_vec,
785
+ prompt_embeds_mask=llama_attention_mask,
786
+ prompt_poolers=clip_l_pooler,
787
+ negative_prompt_embeds=llama_vec_n,
788
+ negative_prompt_embeds_mask=llama_attention_mask_n,
789
+ negative_prompt_poolers=clip_l_pooler_n,
790
+ device=gpu,
791
+ dtype=torch.bfloat16,
792
+ image_embeddings=image_encoder_last_hidden_state,
793
+ latent_indices=latent_indices,
794
+ clean_latents=clean_latents,
795
+ clean_latent_indices=clean_latent_indices,
796
+ clean_latents_2x=clean_latents_2x,
797
+ clean_latent_2x_indices=clean_latent_2x_indices,
798
+ clean_latents_4x=clean_latents_4x,
799
+ clean_latent_4x_indices=clean_latent_4x_indices,
800
+ callback=callback,
801
+ )
802
+
803
+ if is_last_section:
804
+ generated_latents = torch.cat([start_latent.to(generated_latents), generated_latents], dim=2)
805
+
806
+ total_generated_latent_frames += int(generated_latents.shape[2])
807
+ history_latents = torch.cat([generated_latents.to(history_latents), history_latents], dim=2)
808
+
809
+ if not high_vram:
810
+ offload_model_from_device_for_memory_preservation(transformer, target_device=gpu, preserved_memory_gb=8)
811
+ load_model_as_complete(vae, target_device=gpu)
812
+
813
+ real_history_latents = history_latents[:, :, :total_generated_latent_frames, :, :]
814
+
815
+ if history_pixels is None:
816
+ history_pixels = vae_decode(real_history_latents, vae).cpu()
817
+ else:
818
+ section_latent_frames = (latent_window_size * 2 + 1) if is_last_section else (latent_window_size * 2)
819
+ overlapped_frames = latent_window_size * 4 - 3
820
+
821
+ current_pixels = vae_decode(real_history_latents[:, :, :section_latent_frames], vae).cpu()
822
+ history_pixels = soft_append_bcthw(current_pixels, history_pixels, overlapped_frames)
823
+
824
+ if not high_vram:
825
+ unload_complete_models(vae)
826
+
827
+ output_filename = os.path.join(outputs_folder, f'{job_id}_{total_generated_latent_frames}.mp4')
828
+
829
+ save_bcthw_as_mp4(history_pixels, output_filename, fps=fps_number, crf=mp4_crf)
830
+
831
+ print(f'Decoded. Current latent shape {real_history_latents.shape}; pixel shape {history_pixels.shape}')
832
+
833
+ stream.output_queue.push(('file', output_filename))
834
+
835
+ if is_last_section:
836
+ break
837
+ except:
838
+ traceback.print_exc()
839
+
840
+ if not high_vram:
841
+ unload_complete_models(
842
+ text_encoder, text_encoder_2, image_encoder, vae, transformer
843
+ )
844
+
845
+ stream.output_queue.push(('end', None))
846
+ return
847
+
848
  # 20250506 pftq: Modified worker to accept video input and clean frame count
849
  @torch.no_grad()
850
  def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch):
 
1125
  stream.output_queue.push(('end', None))
1126
  return
1127
 
1128
+ def get_duration(input_image, image_position, end_image, prompts, generation_mode, n_prompt, seed, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number):
1129
  return allocation_time
1130
 
1131
  # Remove this decorator if you run on local
1132
  @spaces.GPU(duration=get_duration)
1133
+ def process_on_gpu(input_image, image_position, end_image, prompts, generation_mode, n_prompt, seed, resolution, total_second_length, allocation_time, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number
1134
  ):
1135
  start = time.time()
1136
  global stream
1137
  stream = AsyncStream()
1138
 
1139
+ async_run(worker_start_end if generation_mode == "start_end" else worker, input_image, image_position, end_image, prompts, n_prompt, seed, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, use_teacache, mp4_crf, fps_number)
1140
 
1141
  output_filename = None
1142
 
 
1167
 
1168
  def process(input_image,
1169
  image_position=0,
1170
+ end_image=None,
1171
  prompt="",
1172
  generation_mode="image",
1173
  n_prompt="",
 
1178
  resolution=640,
1179
  total_second_length=5,
1180
  latent_window_size=9,
1181
+ steps=30,
1182
  cfg=1.0,
1183
  gs=10.0,
1184
  rs=0.0,
1185
  gpu_memory_preservation=6,
1186
+ enable_preview=False,
1187
  use_teacache=False,
1188
  mp4_crf=16,
1189
  fps_number=30
 
1191
  if auto_allocation:
1192
  allocation_time = min(total_second_length * 60 * (1.5 if use_teacache else 3.0) * (1 + ((steps - 25) / 25))**2, 600)
1193
 
1194
+ if input_image_debug_value[0] is not None or end_image_debug_value[0] is not None or prompt_debug_value[0] is not None or total_second_length_debug_value[0] is not None:
1195
  input_image = input_image_debug_value[0]
1196
+ end_image = end_image_debug_value[0]
1197
  prompt = prompt_debug_value[0]
1198
  total_second_length = total_second_length_debug_value[0]
1199
  allocation_time = min(total_second_length_debug_value[0] * 60 * 100, 600)
1200
+ input_image_debug_value[0] = end_image_debug_value[0] = input_video_debug_value[0] = prompt_debug_value[0] = total_second_length_debug_value[0] = None
1201
 
1202
  if torch.cuda.device_count() == 0:
1203
  gr.Warning('Set this space to GPU config to make it work.')
 
1219
 
1220
  yield from process_on_gpu(input_image,
1221
  image_position,
1222
+ end_image,
1223
  prompts,
1224
  generation_mode,
1225
  n_prompt,
 
1290
  prompt = prompt_debug_value[0]
1291
  total_second_length = total_second_length_debug_value[0]
1292
  allocation_time = min(total_second_length_debug_value[0] * 60 * 100, 600)
1293
+ input_image_debug_value[0] = end_image_debug_value[0] = input_video_debug_value[0] = prompt_debug_value[0] = total_second_length_debug_value[0] = None
1294
 
1295
  if torch.cuda.device_count() == 0:
1296
  gr.Warning('Set this space to GPU config to make it work.')
 
1390
  local_storage = gr.BrowserState(default_local_storage)
1391
  with gr.Row():
1392
  with gr.Column():
1393
+ generation_mode = gr.Radio([["Text-to-Video", "text"], ["Image-to-Video", "image"], ["Start & end frames", "start_end"], ["Video Extension", "video"]], elem_id="generation-mode", label="Generation mode", value = "image")
1394
  text_to_video_hint = gr.HTML("Text-to-Video badly works with a flash effect at the start. I discourage to use the Text-to-Video feature. You should rather generate an image with Flux and use Image-to-Video. You will save time.")
1395
  input_image = gr.Image(sources='upload', type="numpy", label="Image", height=320)
1396
+ end_image = gr.Image(sources='upload', type="numpy", label="End Frame (Optional)", height=320)
1397
  image_position = gr.Slider(label="Image position", minimum=0, maximum=100, value=0, step=1, info='0=Video start; 100=Video end (lower quality)')
1398
  input_video = gr.Video(sources='upload', label="Input Video", height=320)
1399
  timeless_prompt = gr.Textbox(label="Timeless prompt", info='Used on the whole duration of the generation', value='', placeholder="The creature starts to move, fast motion, fixed camera, focus motion, consistent arm, consistent position, mute colors, insanely detailed")
 
1419
  enable_preview = gr.Checkbox(label='Enable preview', value=True, info='Display a preview around each second generated but it costs 2 sec. for each second generated.')
1420
  use_teacache = gr.Checkbox(label='Use TeaCache', value=False, info='Faster speed and no break in brightness, but often makes hands and fingers slightly worse.')
1421
 
1422
+ n_prompt = gr.Textbox(label="Negative Prompt", value="Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", info='Requires using normal CFG (undistilled) instead of Distilled (set Distilled=1 and CFG > 1).')
1423
 
1424
  fps_number = gr.Slider(label="Frame per seconds", info="The model is trained for 30 fps so other fps may generate weird results", minimum=10, maximum=60, value=30, step=1)
1425
 
 
1469
 
1470
  with gr.Accordion("Debug", open=False):
1471
  input_image_debug = gr.Image(type="numpy", label="Image Debug", height=320)
1472
+ end_image_debug = gr.Image(type="numpy", label="End Image Debug", height=320)
1473
  input_video_debug = gr.Video(sources='upload', label="Input Video Debug", height=320)
1474
  prompt_debug = gr.Textbox(label="Prompt Debug", value='')
1475
  total_second_length_debug = gr.Slider(label="Additional Video Length to Generate (seconds) Debug", minimum=1, maximum=120, value=1, step=0.1)
 
1481
  progress_desc = gr.Markdown('', elem_classes='no-generating-animation')
1482
  progress_bar = gr.HTML('', elem_classes='no-generating-animation')
1483
 
1484
+ ips = [input_image, image_position, end_image, final_prompt, generation_mode, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, mp4_crf, fps_number]
 
1485
  ips_video = [input_video, final_prompt, n_prompt, randomize_seed, seed, auto_allocation, allocation_time, batch, resolution, total_second_length, latent_window_size, steps, cfg, gs, rs, gpu_memory_preservation, enable_preview, use_teacache, no_resize, mp4_crf, num_clean_frames, vae_batch]
1486
 
1487
  with gr.Row(elem_id="text_examples", visible=False):
 
1491
  [
1492
  None, # input_image
1493
  0, # image_position
1494
+ None, # end_image
1495
  "Overcrowed street in Japan, photorealistic, realistic, intricate details, 8k, insanely detailed",
1496
  "text", # generation_mode
1497
+ "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
1498
  True, # randomize_seed
1499
  42, # seed
1500
  True, # auto_allocation
 
1527
  [
1528
  "./img_examples/Example2.webp", # input_image
1529
  0, # image_position
1530
+ None, # end_image
1531
  "A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks, the man stops talking and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens",
1532
  "image", # generation_mode
1533
+ "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
1534
  True, # randomize_seed
1535
  42, # seed
1536
  True, # auto_allocation
 
1551
  [
1552
  "./img_examples/Example1.png", # input_image
1553
  0, # image_position
1554
+ None, # end_image
1555
  "A dolphin emerges from the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
1556
  "image", # generation_mode
1557
+ "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
1558
  True, # randomize_seed
1559
  42, # seed
1560
  True, # auto_allocation
 
1575
  [
1576
  "./img_examples/Example4.webp", # input_image
1577
  1, # image_position
1578
+ None, # end_image
1579
  "A building starting to explode, photorealistic, realisitc, 8k, insanely detailed",
1580
  "image", # generation_mode
1581
+ "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
1582
  True, # randomize_seed
1583
  42, # seed
1584
  True, # auto_allocation
 
1599
  [
1600
  "./img_examples/Example4.webp", # input_image
1601
  50, # image_position
1602
+ None, # end_image
1603
  "A building starting to explode, photorealistic, realisitc, 8k, insanely detailed",
1604
  "image", # generation_mode
1605
+ "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
1606
  True, # randomize_seed
1607
  42, # seed
1608
  True, # auto_allocation
 
1623
  [
1624
  "./img_examples/Example4.webp", # input_image
1625
  100, # image_position
1626
+ None, # end_image
1627
  "A building starting to explode, photorealistic, realisitc, 8k, insanely detailed",
1628
  "image", # generation_mode
1629
+ "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
1630
+ True, # randomize_seed
1631
+ 42, # seed
1632
+ True, # auto_allocation
1633
+ 180, # allocation_time
1634
+ 672, # resolution
1635
+ 1, # total_second_length
1636
+ 9, # latent_window_size
1637
+ 30, # steps
1638
+ 1.0, # cfg
1639
+ 10.0, # gs
1640
+ 0.0, # rs
1641
+ 6, # gpu_memory_preservation
1642
+ False, # enable_preview
1643
+ False, # use_teacache
1644
+ 16, # mp4_crf
1645
+ 30 # fps_number
1646
+ ],
1647
+ ],
1648
+ run_on_click = True,
1649
+ fn = process,
1650
+ inputs = ips,
1651
+ outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
1652
+ cache_examples = torch.cuda.device_count() > 0,
1653
+ )
1654
+
1655
+ with gr.Row(elem_id="start_end_examples", visible=False):
1656
+ gr.Examples(
1657
+ label = "Examples from start and end frames",
1658
+ examples = [
1659
+ [
1660
+ "./img_examples/Example2.webp", # input_image
1661
+ 0, # image_position
1662
+ None, # end_image
1663
+ "A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks, the man stops talking and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens",
1664
+ "start_end", # generation_mode
1665
+ "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
1666
  True, # randomize_seed
1667
  42, # seed
1668
  True, # auto_allocation
 
1695
  [
1696
  "./img_examples/Example1.mp4", # input_video
1697
  "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
1698
+ "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
1699
  True, # randomize_seed
1700
  42, # seed
1701
  True, # auto_allocation
 
1719
  [
1720
  "./img_examples/Example1.mp4", # input_video
1721
  "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
1722
+ "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
1723
  True, # randomize_seed
1724
  42, # seed
1725
  True, # auto_allocation
 
1754
  [
1755
  None, # input_image
1756
  0, # image_position
1757
+ None, # end_image
1758
  "Overcrowed street in Japan, photorealistic, realistic, intricate details, 8k, insanely detailed",
1759
  "text", # generation_mode
1760
+ "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
1761
  True, # randomize_seed
1762
  42, # seed
1763
  True, # auto_allocation
 
1789
  [
1790
  "./img_examples/Example1.png", # input_image
1791
  0, # image_position
1792
+ None, # end_image
1793
  "A dolphin emerges from the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
1794
  "image", # generation_mode
1795
+ "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
1796
  True, # randomize_seed
1797
  42, # seed
1798
  True, # auto_allocation
 
1813
  [
1814
  "./img_examples/Example2.webp", # input_image
1815
  0, # image_position
1816
+ None, # end_image
1817
  "A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks, the man stops talking and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens",
1818
  "image", # generation_mode
1819
+ "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
1820
  True, # randomize_seed
1821
  42, # seed
1822
  True, # auto_allocation
 
1837
  [
1838
  "./img_examples/Example2.webp", # input_image
1839
  0, # image_position
1840
+ None, # end_image
1841
  "A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks, the woman stops talking and the woman listens A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens",
1842
  "image", # generation_mode
1843
+ "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
1844
  True, # randomize_seed
1845
  42, # seed
1846
  True, # auto_allocation
 
1861
  [
1862
  "./img_examples/Example3.jpg", # input_image
1863
  0, # image_position
1864
+ None, # end_image
1865
  "A boy is walking to the right, full view, full-length view, cartoon",
1866
  "image", # generation_mode
1867
+ "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
1868
  True, # randomize_seed
1869
  42, # seed
1870
  True, # auto_allocation
 
1885
  [
1886
  "./img_examples/Example4.webp", # input_image
1887
  100, # image_position
1888
+ None, # end_image
1889
  "A building starting to explode, photorealistic, realisitc, 8k, insanely detailed",
1890
  "image", # generation_mode
1891
+ "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
1892
  True, # randomize_seed
1893
  42, # seed
1894
  True, # auto_allocation
 
1914
  cache_examples = False,
1915
  )
1916
 
1917
# Clickable example for the start-frame/end-frame generation mode.
# The row order must match `ips` (the inputs list passed to `process`).
gr.Examples(
    label = "🖼️ Examples from start and end frames",
    examples = [
        [
            "./img_examples/Example1.png",  # input_image
            0,  # image_position
            None,  # end_image  NOTE(review): a "start_end" example with no end frame looks inconsistent — confirm an end image asset exists
            "A dolphin emerges from the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
            "start_end",  # generation_mode
            "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth",  # n_prompt
            True,  # randomize_seed
            42,  # seed
            True,  # auto_allocation
            180,  # allocation_time
            672,  # resolution
            1,  # total_second_length
            9,  # latent_window_size
            30,  # steps
            1.0,  # cfg
            10.0,  # gs
            0.0,  # rs
            6,  # gpu_memory_preservation
            False,  # enable_preview
            True,  # use_teacache
            16,  # mp4_crf
            30,  # fps_number
        ],
    ],
    run_on_click = True,
    fn = process,
    inputs = ips,
    outputs = [result_video, preview_image, progress_desc, progress_bar, start_button, end_button, warning],
    cache_examples = False,
)
1951
+
1952
  gr.Examples(
1953
  label = "🎥 Examples from video",
1954
  examples = [
1955
  [
1956
  "./img_examples/Example1.mp4", # input_video
1957
  "View of the sea as far as the eye can see, from the seaside, a piece of land is barely visible on the horizon at the middle, the sky is radiant, reflections of the sun in the water, photorealistic, realistic, intricate details, 8k, insanely detailed",
1958
+ "Missing arm, long hand, unrealistic position, impossible contortion, visible bone, muscle contraction, poorly framed, blurred, blurry, over-smooth", # n_prompt
1959
  True, # randomize_seed
1960
  42, # seed
1961
  True, # auto_allocation
 
2006
 
2007
def handle_generation_mode_change(generation_mode_data):
    """Return the per-component visibility updates for the selected generation mode.

    Parameters
    ----------
    generation_mode_data : str
        One of "text", "image", "start_end" or "video" (the radio value).

    Returns
    -------
    list[dict] | None
        Thirteen ``gr.update(visible=...)`` dicts, one per wired output, in
        this fixed order: text_to_video_hint, image_position, input_image,
        end_image, input_video, start_button, start_button_video, no_resize,
        batch, num_clean_frames, vae_batch, prompt_hint, fps_number.
        Returns ``None`` for an unrecognized mode (mirrors the original
        implicit fall-through).
    """
    # Visibility flags per mode, in the output order documented above.
    # Table-driven form replaces four near-identical 13-line branches so a
    # new component only needs one new column here.
    visibility_by_mode = {
        #             hint   img_pos in_img end_img in_vid start  start_v no_rsz batch  n_clean vae    p_hint fps
        "text":      (True,  False,  False, False,  False, True,  False,  False, False, False,  False, False, True),
        "image":     (False, True,   True,  False,  False, True,  False,  False, False, False,  False, False, True),
        "start_end": (False, False,  True,  True,   False, True,  False,  False, False, False,  False, False, True),
        "video":     (False, False,  False, False,  True,  False, True,   True,  True,  True,   True,  True,  False),
    }
    flags = visibility_by_mode.get(generation_mode_data)
    if flags is None:
        # Unknown mode: preserve the original behavior (no return statement
        # was reached, so the caller received None).
        return None
    return [gr.update(visible = flag) for flag in flags]
2072
+
2073
def handle_field_debug_change(input_image_debug_data, input_video_debug_data, end_image_debug_data, prompt_debug_data, total_second_length_debug_data):
    """Persist the latest debug-widget values into the module-level holders.

    Each holder is a one-element list used as a mutable cell: index 0 always
    carries the most recent widget value. Returns an empty list because the
    Gradio events that call this declare no outputs.
    """
    print("handle_field_debug_change")
    cell_value_pairs = (
        (input_image_debug_value, input_image_debug_data),
        (input_video_debug_value, input_video_debug_data),
        (end_image_debug_value, end_image_debug_data),
        (prompt_debug_value, prompt_debug_data),
        (total_second_length_debug_value, total_second_length_debug_data),
    )
    for holder, latest in cell_value_pairs:
        holder[0] = latest
    return []
2081
 
2082
# Wire every hidden debug field: touching any one of them re-reads all five
# into the shared holders via handle_field_debug_change. File-like widgets
# fire on "upload", text/number widgets on "change".
_debug_inputs = [input_image_debug, input_video_debug, end_image_debug, prompt_debug, total_second_length_debug]

for _debug_component, _debug_event in (
    (input_image_debug, "upload"),
    (input_video_debug, "upload"),
    (end_image_debug, "upload"),
    (prompt_debug, "change"),
    (total_second_length_debug, "change"),
):
    getattr(_debug_component, _debug_event)(
        fn=handle_field_debug_change,
        inputs=_debug_inputs,
        outputs=[],
    )
2111
 
 
2129
# Re-layout the UI whenever the generation mode radio changes; the output
# order must match the update list handle_generation_mode_change returns.
generation_mode.change(
    fn=handle_generation_mode_change,
    inputs=[generation_mode],
    outputs=[
        text_to_video_hint, image_position, input_image, end_image,
        input_video, start_button, start_button_video, no_resize,
        batch, num_clean_frames, vae_batch, prompt_hint, fps_number,
    ],
)
2134
 
2135
  # Update display when the page loads
 
2137
  fn=handle_generation_mode_change, inputs = [
2138
  generation_mode
2139
  ], outputs = [
2140
+ text_to_video_hint, image_position, input_image, end_image, input_video, start_button, start_button_video, no_resize, batch, num_clean_frames, vae_batch, prompt_hint, fps_number
2141
  ]
2142
  )
2143