Fabrice-TIERCELIN committed on
Commit
cbfbd85
·
verified ·
1 Parent(s): 55436b0

Optimization

Browse files
Files changed (1) hide show
  1. app.py +12 -9
app.py CHANGED
@@ -399,9 +399,10 @@ def worker(input_image, prompts, n_prompt, seed, resolution, total_second_length
399
  rnd = torch.Generator("cpu").manual_seed(seed)
400
 
401
  history_latents = torch.zeros(size=(1, 16, 16 + 2 + 1, height // 8, width // 8), dtype=torch.float32).cpu()
 
402
  history_pixels = None
403
 
404
- history_latents = torch.cat([history_latents, start_latent.to(history_latents)], dim=2)
405
  total_generated_latent_frames = 1
406
 
407
  if enable_preview:
@@ -481,7 +482,7 @@ def worker(input_image, prompts, n_prompt, seed, resolution, total_second_length
481
  transformer.initialize_teacache(enable_teacache=False)
482
 
483
  clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -sum([16, 2, 1]):, :, :].split([16, 2, 1], dim=2)
484
- clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2)
485
 
486
  generated_latents = sample_hunyuan(
487
  transformer=transformer,
@@ -620,9 +621,10 @@ def worker_last_frame(input_image, prompts, n_prompt, seed, resolution, total_se
620
  rnd = torch.Generator("cpu").manual_seed(seed)
621
 
622
  history_latents = torch.zeros(size=(1, 16, 16 + 2 + 1, height // 8, width // 8), dtype=torch.float32).cpu()
 
623
  history_pixels = None
624
 
625
- history_latents = torch.cat([start_latent.to(history_latents), history_latents], dim=2)
626
  total_generated_latent_frames = 1
627
 
628
  if enable_preview:
@@ -702,7 +704,7 @@ def worker_last_frame(input_image, prompts, n_prompt, seed, resolution, total_se
702
  transformer.initialize_teacache(enable_teacache=False)
703
 
704
  clean_latents_1x, clean_latents_2x, clean_latents_4x = history_latents[:, :, :sum([1, 2, 16]), :, :].split([1, 2, 16], dim=2)
705
- clean_latents = torch.cat([clean_latents_1x, start_latent.to(history_latents)], dim=2)
706
 
707
  generated_latents = sample_hunyuan(
708
  transformer=transformer,
@@ -794,6 +796,7 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
794
 
795
  # 20250506 pftq: Encode video
796
  start_latent, input_image_np, video_latents, fps, height, width, input_video_pixels = video_encode(input_video, resolution, no_resize, vae, vae_batch_size=vae_batch, device=gpu)
 
797
 
798
  # CLIP Vision
799
  stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
@@ -883,7 +886,7 @@ def worker_video(input_video, prompts, n_prompt, seed, batch, resolution, total_
883
  if effective_clean_frames > 0 and split_idx < len(splits):
884
  clean_latents_1x = splits[split_idx]
885
 
886
- clean_latents = torch.cat([start_latent.to(history_latents), clean_latents_1x], dim=2)
887
 
888
  # 20250507 pftq: Fix for <=1 sec videos.
889
  max_frames = min(latent_window_size * 4 - 3, history_latents.shape[2] * 4)
@@ -1341,7 +1344,7 @@ with block:
1341
  examples = [
1342
  [
1343
  "./img_examples/Example2.webp", # input_image
1344
- 100, # image_position
1345
  "A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens",
1346
  "image", # generation_mode
1347
  "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
@@ -1376,12 +1379,12 @@ with block:
1376
  0.0, # rs
1377
  6, # gpu_memory_preservation
1378
  False, # enable_preview
1379
- False, # use_teacache
1380
  16 # mp4_crf
1381
  ],
1382
  [
1383
  "./img_examples/Example3.jpg", # input_image
1384
- 0, # image_position
1385
  "A boy is walking to the right, full view, full-length view, cartoon",
1386
  "image", # generation_mode
1387
  "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
@@ -1396,7 +1399,7 @@ with block:
1396
  0.0, # rs
1397
  6, # gpu_memory_preservation
1398
  False, # enable_preview
1399
- True, # use_teacache
1400
  16 # mp4_crf
1401
  ],
1402
  ],
 
399
  rnd = torch.Generator("cpu").manual_seed(seed)
400
 
401
  history_latents = torch.zeros(size=(1, 16, 16 + 2 + 1, height // 8, width // 8), dtype=torch.float32).cpu()
402
+ start_latent = start_latent.to(history_latents)
403
  history_pixels = None
404
 
405
+ history_latents = torch.cat([history_latents, start_latent], dim=2)
406
  total_generated_latent_frames = 1
407
 
408
  if enable_preview:
 
482
  transformer.initialize_teacache(enable_teacache=False)
483
 
484
  clean_latents_4x, clean_latents_2x, clean_latents_1x = history_latents[:, :, -sum([16, 2, 1]):, :, :].split([16, 2, 1], dim=2)
485
+ clean_latents = torch.cat([start_latent, clean_latents_1x], dim=2)
486
 
487
  generated_latents = sample_hunyuan(
488
  transformer=transformer,
 
621
  rnd = torch.Generator("cpu").manual_seed(seed)
622
 
623
  history_latents = torch.zeros(size=(1, 16, 16 + 2 + 1, height // 8, width // 8), dtype=torch.float32).cpu()
624
+ start_latent = start_latent.to(history_latents)
625
  history_pixels = None
626
 
627
+ history_latents = torch.cat([start_latent, history_latents], dim=2)
628
  total_generated_latent_frames = 1
629
 
630
  if enable_preview:
 
704
  transformer.initialize_teacache(enable_teacache=False)
705
 
706
  clean_latents_1x, clean_latents_2x, clean_latents_4x = history_latents[:, :, :sum([1, 2, 16]), :, :].split([1, 2, 16], dim=2)
707
+ clean_latents = torch.cat([clean_latents_1x, start_latent], dim=2)
708
 
709
  generated_latents = sample_hunyuan(
710
  transformer=transformer,
 
796
 
797
  # 20250506 pftq: Encode video
798
  start_latent, input_image_np, video_latents, fps, height, width, input_video_pixels = video_encode(input_video, resolution, no_resize, vae, vae_batch_size=vae_batch, device=gpu)
799
+ start_latent = start_latent.to(dtype=torch.float32).cpu()
800
 
801
  # CLIP Vision
802
  stream.output_queue.push(('progress', (None, '', make_progress_bar_html(0, 'CLIP Vision encoding ...'))))
 
886
  if effective_clean_frames > 0 and split_idx < len(splits):
887
  clean_latents_1x = splits[split_idx]
888
 
889
+ clean_latents = torch.cat([start_latent, clean_latents_1x], dim=2)
890
 
891
  # 20250507 pftq: Fix for <=1 sec videos.
892
  max_frames = min(latent_window_size * 4 - 3, history_latents.shape[2] * 4)
 
1344
  examples = [
1345
  [
1346
  "./img_examples/Example2.webp", # input_image
1347
+ 0, # image_position
1348
  "A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The man talks and the woman listens; A man on the left and a woman on the right face each other ready to start a conversation, large space between the persons, full view, full-length view, 3D, pixar, 3D render, CGI. The woman talks and the man listens",
1349
  "image", # generation_mode
1350
  "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
 
1379
  0.0, # rs
1380
  6, # gpu_memory_preservation
1381
  False, # enable_preview
1382
+ True, # use_teacache
1383
  16 # mp4_crf
1384
  ],
1385
  [
1386
  "./img_examples/Example3.jpg", # input_image
1387
+ 100, # image_position
1388
  "A boy is walking to the right, full view, full-length view, cartoon",
1389
  "image", # generation_mode
1390
  "Missing arm, unrealistic position, impossible contortion, visible bone, muscle contraction, blurred, blurry", # n_prompt
 
1399
  0.0, # rs
1400
  6, # gpu_memory_preservation
1401
  False, # enable_preview
1402
+ False, # use_teacache
1403
  16 # mp4_crf
1404
  ],
1405
  ],