alex committed on
Commit
17fc24b
·
1 Parent(s): b7c34d7

guide frame

Browse files
app.py CHANGED
@@ -1404,13 +1404,14 @@ class AudioDropUpload(gr.HTML):
1404
 
1405
 
1406
 
1407
- def generate_video_example(input_image, prompt, camera_lora, resolution, radioanimated_mode, input_video, input_audio, progress=gr.Progress(track_tqdm=True)):
1408
 
1409
  w, h = apply_resolution(resolution)
1410
 
1411
  with timer(f'generating with video path:{input_video} with duration:{duration} and LoRA:{camera_lora} in {w}x{h}'):
1412
  output_video = generate_video(
1413
- input_image,
 
1414
  prompt,
1415
  10,
1416
  input_video,
@@ -1427,7 +1428,8 @@ def generate_video_example(input_image, prompt, camera_lora, resolution, radioan
1427
  return output_video
1428
 
1429
  def get_duration(
1430
- input_image,
 
1431
  prompt,
1432
  duration,
1433
  input_video,
@@ -1457,11 +1459,11 @@ def get_duration(
1457
  return 120 + extra_time
1458
  else:
1459
  return 180 + extra_time
1460
-
1461
-
1462
  @spaces.GPU(duration=get_duration)
1463
  def generate_video(
1464
- input_image,
 
1465
  prompt: str,
1466
  duration: float,
1467
  input_video = None,
@@ -1478,7 +1480,8 @@ def generate_video(
1478
  """
1479
  Generate a short cinematic video from a text prompt and optional input image using the LTX-2 distilled pipeline.
1480
  Args:
1481
- input_image: Optional input image for image-to-video. If provided, it is injected at frame 0 to guide motion.
 
1482
  prompt: Text description of the scene, motion, and cinematic style to generate.
1483
  duration: Desired video length in seconds. Converted to frames using a fixed 24 FPS rate.
1484
  input_video: Optional conditioning video path (mp4). If provided, motion is guided by this video.
@@ -1534,7 +1537,7 @@ def generate_video(
1534
  target_fps=frame_rate,
1535
  )
1536
 
1537
- if input_image is None:
1538
  images = [(first_png, 0, 1.0)]
1539
 
1540
  if audio_path is None:
@@ -1554,13 +1557,19 @@ def generate_video(
1554
  videos = [(cond_path, 1.0)]
1555
  camera_lora = "Pose"
1556
 
1557
- if input_image is not None:
1558
- images = [(input_image, 0, 1.0)]
 
 
 
 
 
 
1559
 
1560
  embeddings, final_prompt, status = encode_prompt(
1561
  prompt=prompt,
1562
  enhance_prompt=enhance_prompt,
1563
- input_image=input_image,
1564
  seed=current_seed,
1565
  negative_prompt="",
1566
  )
@@ -1653,9 +1662,10 @@ def apply_duration(duration: str):
1653
  return duration_s
1654
 
1655
  def on_mode_change(selected: str):
1656
- is_i2v = (selected == "Image-to-Video")
 
1657
 
1658
- return gr.update(visible=not is_i2v)
1659
 
1660
 
1661
 
@@ -2468,7 +2478,7 @@ def apply_example(idx: str):
2468
  idx = int(idx)
2469
 
2470
  # Read the example row from your list
2471
- img, prompt_txt, cam, res, mode, vid, aud = examples_list[idx]
2472
 
2473
  img_path = img if img else None
2474
  vid_path = vid if vid else None
@@ -2481,6 +2491,7 @@ def apply_example(idx: str):
2481
  mode_update = mode
2482
  video_update = gr.update(value=vid_path, visible=(mode == "Motion Control"))
2483
  audio_update = aud_path
 
2484
 
2485
  return (
2486
  input_image_update,
@@ -2491,6 +2502,7 @@ def apply_example(idx: str):
2491
  video_update,
2492
  audio_update,
2493
  audio_update,
 
2494
  )
2495
 
2496
 
@@ -2522,7 +2534,7 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
2522
  with gr.Column(elem_id="col-container"):
2523
  with gr.Row(elem_id="mode-row"):
2524
  radioanimated_mode = RadioAnimated(
2525
- choices=["Image-to-Video", "Motion Control"],
2526
  value="Image-to-Video",
2527
  elem_id="radioanimated_mode"
2528
  )
@@ -2531,12 +2543,19 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
2531
 
2532
  with gr.Row():
2533
 
2534
- input_image = gr.Image(
2535
  label="First Frame (Optional)",
2536
  type="filepath",
2537
  height=256
2538
  )
2539
-
 
 
 
 
 
 
 
2540
  input_video = gr.Video(
2541
  label="Motion Reference Video",
2542
  height=256,
@@ -2700,7 +2719,7 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
2700
  radioanimated_mode.change(
2701
  fn=on_mode_change,
2702
  inputs=radioanimated_mode,
2703
- outputs=[input_video],
2704
  api_visibility="private",
2705
  )
2706
 
@@ -2728,7 +2747,8 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
2728
  generate_btn.click(
2729
  fn=generate_video,
2730
  inputs=[
2731
- input_image,
 
2732
  prompt,
2733
  duration,
2734
  input_video,
@@ -2752,7 +2772,18 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
2752
  "16:9",
2753
  "Image-to-Video",
2754
  None,
2755
- "examples/supergirl.m4a"
 
 
 
 
 
 
 
 
 
 
 
2756
  ],
2757
  [
2758
  "examples/supergirl.png",
@@ -2762,6 +2793,7 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
2762
  "Image-to-Video",
2763
  None,
2764
  None,
 
2765
  ],
2766
  [
2767
  "examples/clay.png",
@@ -2771,6 +2803,7 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
2771
  "Motion Control",
2772
  "examples/tiktok.mp4",
2773
  None,
 
2774
  ],
2775
  [
2776
  "examples/paint.png",
@@ -2780,6 +2813,7 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
2780
  "Motion Control",
2781
  "examples/tiktok.mp4",
2782
  None,
 
2783
  ],
2784
  [
2785
  "examples/highland.png",
@@ -2789,6 +2823,7 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
2789
  "Image-to-Video",
2790
  None,
2791
  None,
 
2792
  ],
2793
  [
2794
  "examples/wednesday.png",
@@ -2798,6 +2833,7 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
2798
  "Image-to-Video",
2799
  None,
2800
  None,
 
2801
  ],
2802
  [
2803
  "examples/astronaut.png",
@@ -2807,13 +2843,14 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
2807
  "Image-to-Video",
2808
  None,
2809
  None,
 
2810
  ],
2811
  ]
2812
 
2813
  examples_obj = create_examples(
2814
  examples=examples_list,
2815
  fn=generate_video_example,
2816
- inputs=[input_image, prompt_ui, camera_ui, resolution_ui, radioanimated_mode, input_video, audio_input],
2817
  outputs = [output_video],
2818
  label="Examples",
2819
  cache_examples=True,
@@ -2822,13 +2859,15 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
2822
 
2823
  preset_gallery = PresetGallery(
2824
  items=[
2825
- {"thumb": "examples/supergirl-2.png", "label": "Example 1", "title": "Image+Audio to Video" },
2826
- {"thumb": "examples/supergirl.png" , "label": "Example 2", "title": "Image to Video" },
2827
- {"thumb": "examples/clay.png" , "label": "Example 3", "title": "Pose to Video" },
2828
- {"thumb": "examples/paint.png" , "label": "Example 4", "title": "Pose to Video" },
2829
- {"thumb": "examples/highland.png" , "label": "Example 5", "title": "Image to Video" },
2830
- {"thumb": "examples/wednesday.png" , "label": "Example 6", "title": "Image to Video" },
2831
- {"thumb": "examples/astronaut.png" , "label": "Example 7", "title": "Image to Video" },
 
 
2832
  ],
2833
  title="Click on Our Examples",
2834
  )
@@ -2858,7 +2897,7 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
2858
  fn=apply_example,
2859
  inputs=preset_gallery,
2860
  outputs=[
2861
- input_image,
2862
  prompt_ui,
2863
  camera_ui,
2864
  resolution_ui,
@@ -2866,6 +2905,7 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
2866
  input_video,
2867
  audio_input,
2868
  audio_ui,
 
2869
  ],
2870
  api_visibility="private",
2871
  ).then(
 
1404
 
1405
 
1406
 
1407
+ def generate_video_example(first_frame, prompt, camera_lora, resolution, radioanimated_mode, input_video, input_audio, end_frame, progress=gr.Progress(track_tqdm=True)):
1408
 
1409
  w, h = apply_resolution(resolution)
1410
 
1411
  with timer(f'generating with video path:{input_video} with duration:{duration} and LoRA:{camera_lora} in {w}x{h}'):
1412
  output_video = generate_video(
1413
+ first_frame,
1414
+ end_frame,
1415
  prompt,
1416
  10,
1417
  input_video,
 
1428
  return output_video
1429
 
1430
  def get_duration(
1431
+ first_frame,
1432
+ end_frame,
1433
  prompt,
1434
  duration,
1435
  input_video,
 
1459
  return 120 + extra_time
1460
  else:
1461
  return 180 + extra_time
1462
+
 
1463
  @spaces.GPU(duration=get_duration)
1464
  def generate_video(
1465
+ first_frame,
1466
+ end_frame,
1467
  prompt: str,
1468
  duration: float,
1469
  input_video = None,
 
1480
  """
1481
  Generate a short cinematic video from a text prompt and optional input image using the LTX-2 distilled pipeline.
1482
  Args:
1483
+ first_frame: Optional first frame for image-to-video. If provided, it is injected at frame 0 to guide motion.
1484
+ end_frame: Optional last frame for image-to-video. If provided, it is injected at last frame to guide motion.
1485
  prompt: Text description of the scene, motion, and cinematic style to generate.
1486
  duration: Desired video length in seconds. Converted to frames using a fixed 24 FPS rate.
1487
  input_video: Optional conditioning video path (mp4). If provided, motion is guided by this video.
 
1537
  target_fps=frame_rate,
1538
  )
1539
 
1540
+ if first_frame is None:
1541
  images = [(first_png, 0, 1.0)]
1542
 
1543
  if audio_path is None:
 
1557
  videos = [(cond_path, 1.0)]
1558
  camera_lora = "Pose"
1559
 
1560
+ if first_frame is not None:
1561
+ images = []
1562
+ images.append((first_frame, 0, 1.0))
1563
+
1564
+ if generation_mode == "Interpolate":
1565
+ if end_frame is not None:
1566
+ end_idx = max(0, num_frames - 1)
1567
+ images.append((end_frame, end_idx, 0.5))
1568
 
1569
  embeddings, final_prompt, status = encode_prompt(
1570
  prompt=prompt,
1571
  enhance_prompt=enhance_prompt,
1572
+ input_image=first_frame,
1573
  seed=current_seed,
1574
  negative_prompt="",
1575
  )
 
1662
  return duration_s
1663
 
1664
  def on_mode_change(selected: str):
1665
+ is_motion = (selected == "Motion Control")
1666
+ is_interpolate = (selected == "Interpolate")
1667
 
1668
+ return (gr.update(visible=is_motion), gr.update(visible=is_interpolate))
1669
 
1670
 
1671
 
 
2478
  idx = int(idx)
2479
 
2480
  # Read the example row from your list
2481
+ img, prompt_txt, cam, res, mode, vid, aud, end_img = examples_list[idx]
2482
 
2483
  img_path = img if img else None
2484
  vid_path = vid if vid else None
 
2491
  mode_update = mode
2492
  video_update = gr.update(value=vid_path, visible=(mode == "Motion Control"))
2493
  audio_update = aud_path
2494
+ end_image = end_img
2495
 
2496
  return (
2497
  input_image_update,
 
2502
  video_update,
2503
  audio_update,
2504
  audio_update,
2505
+ end_image,
2506
  )
2507
 
2508
 
 
2534
  with gr.Column(elem_id="col-container"):
2535
  with gr.Row(elem_id="mode-row"):
2536
  radioanimated_mode = RadioAnimated(
2537
+ choices=["Image-to-Video", "Interpolate", "Motion Control"],
2538
  value="Image-to-Video",
2539
  elem_id="radioanimated_mode"
2540
  )
 
2543
 
2544
  with gr.Row():
2545
 
2546
+ first_frame = gr.Image(
2547
  label="First Frame (Optional)",
2548
  type="filepath",
2549
  height=256
2550
  )
2551
+
2552
+ end_frame = gr.Image(
2553
+ label="Last Frame (Optional)",
2554
+ type="filepath",
2555
+ height=256,
2556
+ visible=False,
2557
+ )
2558
+
2559
  input_video = gr.Video(
2560
  label="Motion Reference Video",
2561
  height=256,
 
2719
  radioanimated_mode.change(
2720
  fn=on_mode_change,
2721
  inputs=radioanimated_mode,
2722
+ outputs=[input_video, end_frame],
2723
  api_visibility="private",
2724
  )
2725
 
 
2747
  generate_btn.click(
2748
  fn=generate_video,
2749
  inputs=[
2750
+ first_frame,
2751
+ end_frame,
2752
  prompt,
2753
  duration,
2754
  input_video,
 
2772
  "16:9",
2773
  "Image-to-Video",
2774
  None,
2775
+ "examples/supergirl.m4a",
2776
+ None,
2777
+ ],
2778
+ [
2779
+ "examples/frame3.png",
2780
+ "a woman in a white dress standing in a supermarket, looking at a stack of pomegranates, she picks one and takes a bite, the camera zooms in to a close up of the pomegranate seeds. A calm music is playing in the supermarket and you can hear her taking a bite.",
2781
+ "Zoom In",
2782
+ "16:9",
2783
+ "Interpolate",
2784
+ None,
2785
+ None,
2786
+ "examples/frame4.png",
2787
  ],
2788
  [
2789
  "examples/supergirl.png",
 
2793
  "Image-to-Video",
2794
  None,
2795
  None,
2796
+ None,
2797
  ],
2798
  [
2799
  "examples/clay.png",
 
2803
  "Motion Control",
2804
  "examples/tiktok.mp4",
2805
  None,
2806
+ None,
2807
  ],
2808
  [
2809
  "examples/paint.png",
 
2813
  "Motion Control",
2814
  "examples/tiktok.mp4",
2815
  None,
2816
+ None,
2817
  ],
2818
  [
2819
  "examples/highland.png",
 
2823
  "Image-to-Video",
2824
  None,
2825
  None,
2826
+ None,
2827
  ],
2828
  [
2829
  "examples/wednesday.png",
 
2833
  "Image-to-Video",
2834
  None,
2835
  None,
2836
+ None,
2837
  ],
2838
  [
2839
  "examples/astronaut.png",
 
2843
  "Image-to-Video",
2844
  None,
2845
  None,
2846
+ None,
2847
  ],
2848
  ]
2849
 
2850
  examples_obj = create_examples(
2851
  examples=examples_list,
2852
  fn=generate_video_example,
2853
+ inputs=[first_frame, prompt_ui, camera_ui, resolution_ui, radioanimated_mode, input_video, audio_input, end_frame],
2854
  outputs = [output_video],
2855
  label="Examples",
2856
  cache_examples=True,
 
2859
 
2860
  preset_gallery = PresetGallery(
2861
  items=[
2862
+ {"thumb": "examples/supergirl-2.png", "label": "Example 1", "title": "Image + Audio to Video" },
2863
+ {"thumb": "examples/frame3.png" , "label": "Example 2", "title": "First and Last Frame" },
2864
+ {"thumb": "examples/supergirl.png" , "label": "Example 3", "title": "Image to Video" },
2865
+ {"thumb": "examples/clay.png" , "label": "Example 4", "title": "Pose to Video" },
2866
+ {"thumb": "examples/paint.png" , "label": "Example 5", "title": "Pose to Video" },
2867
+ {"thumb": "examples/highland.png" , "label": "Example 6", "title": "Image to Video" },
2868
+ {"thumb": "examples/wednesday.png" , "label": "Example 7", "title": "Image to Video" },
2869
+ {"thumb": "examples/astronaut.png" , "label": "Example 8", "title": "Image to Video" },
2870
+
2871
  ],
2872
  title="Click on Our Examples",
2873
  )
 
2897
  fn=apply_example,
2898
  inputs=preset_gallery,
2899
  outputs=[
2900
+ first_frame,
2901
  prompt_ui,
2902
  camera_ui,
2903
  resolution_ui,
 
2905
  input_video,
2906
  audio_input,
2907
  audio_ui,
2908
+ end_frame,
2909
  ],
2910
  api_visibility="private",
2911
  ).then(
packages/ltx-pipelines/src/ltx_pipelines/distilled.py CHANGED
@@ -6,6 +6,7 @@ import torch
6
 
7
  from ltx_core.components.diffusion_steps import EulerDiffusionStep
8
  from ltx_core.loader.sd_ops import LTXV_LORA_COMFY_RENAMING_MAP
 
9
  from ltx_core.components.noisers import GaussianNoiser
10
  from ltx_core.components.protocols import DiffusionStepProtocol
11
  from ltx_core.conditioning import ConditioningItem, VideoConditionByKeyframeIndex, ConditioningError
@@ -34,6 +35,7 @@ from ltx_pipelines.utils.helpers import (
34
  generate_enhanced_prompt,
35
  get_device,
36
  image_conditionings_by_replacing_latent,
 
37
  simple_denoising_func,
38
  )
39
  from ltx_pipelines.utils.media_io import encode_video, load_video_conditioning
@@ -295,19 +297,8 @@ class DistilledPipeline:
295
  strength=audio_strength,
296
  )
297
 
298
- # Use pre-computed embeddings if provided, otherwise encode text
299
- if video_context is None or audio_context is None:
300
- text_encoder = self.model_ledger.text_encoder()
301
- context_p = encode_text(text_encoder, prompts=[prompt])[0]
302
- video_context, audio_context = context_p
303
-
304
- torch.cuda.synchronize()
305
- del text_encoder
306
- utils.cleanup_memory()
307
- else:
308
- # Move pre-computed embeddings to device if needed
309
- video_context = video_context.to(self.device)
310
- audio_context = audio_context.to(self.device)
311
 
312
  # Stage 1: Initial low resolution video generation.
313
  # Load models only if not already cached
@@ -319,6 +310,7 @@ class DistilledPipeline:
319
  self._transformer = self.model_ledger.transformer()
320
  transformer = self._transformer
321
  stage_1_sigmas = torch.Tensor(DISTILLED_SIGMA_VALUES).to(self.device)
 
322
 
323
  def denoising_loop(
324
  sigmas: torch.Tensor, video_state: LatentState, audio_state: LatentState, stepper: DiffusionStepProtocol
@@ -335,7 +327,13 @@ class DistilledPipeline:
335
  ),
336
  )
337
 
338
- stage_1_output_shape = VideoPixelShape(batch=1, frames=num_frames, width=width, height=height, fps=frame_rate)
 
 
 
 
 
 
339
  stage_1_conditionings = self._create_conditionings(
340
  images=images,
341
  video_conditioning=video_conditioning,
@@ -370,8 +368,13 @@ class DistilledPipeline:
370
 
371
  stage_2_sigmas = torch.Tensor(STAGE_2_DISTILLED_SIGMA_VALUES).to(self.device)
372
  stage_2_output_shape = VideoPixelShape(
373
- batch=1, frames=num_frames, width=width * 2, height=height * 2, fps=frame_rate
 
 
 
 
374
  )
 
375
  if apply_video_conditioning_to_stage2:
376
  stage_2_conditionings = self._create_conditionings(
377
  images=images,
@@ -383,7 +386,7 @@ class DistilledPipeline:
383
  video_conditioning_frame_idx=video_conditioning_frame_idx,
384
  )
385
  else:
386
- stage_2_conditionings = image_conditionings_by_replacing_latent(
387
  images=images,
388
  height=stage_2_output_shape.height,
389
  width=stage_2_output_shape.width,
@@ -437,15 +440,37 @@ class DistilledPipeline:
437
  video_conditioning_frame_idx: int,
438
  dtype: torch.dtype,
439
  ):
440
- # 1) Keep ORIGINAL behavior: image conditioning by replacing latent
441
- conditionings = image_conditionings_by_replacing_latent(
442
- images=images,
443
- height=height,
444
- width=width,
445
- video_encoder=video_encoder,
446
- dtype=dtype,
447
- device=self.device,
448
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
449
 
450
  # 2) Optional: add video conditioning (IC-LoRA style)
451
  if not video_conditioning:
 
6
 
7
  from ltx_core.components.diffusion_steps import EulerDiffusionStep
8
  from ltx_core.loader.sd_ops import LTXV_LORA_COMFY_RENAMING_MAP
9
+ from ltx_core.components.schedulers import LTX2Scheduler
10
  from ltx_core.components.noisers import GaussianNoiser
11
  from ltx_core.components.protocols import DiffusionStepProtocol
12
  from ltx_core.conditioning import ConditioningItem, VideoConditionByKeyframeIndex, ConditioningError
 
35
  generate_enhanced_prompt,
36
  get_device,
37
  image_conditionings_by_replacing_latent,
38
+ image_conditionings_by_adding_guiding_latent,
39
  simple_denoising_func,
40
  )
41
  from ltx_pipelines.utils.media_io import encode_video, load_video_conditioning
 
297
  strength=audio_strength,
298
  )
299
 
300
+ video_context = video_context.to(self.device)
301
+ audio_context = audio_context.to(self.device)
 
 
 
 
 
 
 
 
 
 
 
302
 
303
  # Stage 1: Initial low resolution video generation.
304
  # Load models only if not already cached
 
310
  self._transformer = self.model_ledger.transformer()
311
  transformer = self._transformer
312
  stage_1_sigmas = torch.Tensor(DISTILLED_SIGMA_VALUES).to(self.device)
313
+ # stage_1_sigmas = LTX2Scheduler().execute(steps=40).to(dtype=torch.float32, device=self.device)
314
 
315
  def denoising_loop(
316
  sigmas: torch.Tensor, video_state: LatentState, audio_state: LatentState, stepper: DiffusionStepProtocol
 
327
  ),
328
  )
329
 
330
+ stage_1_output_shape = VideoPixelShape(
331
+ batch=1,
332
+ frames=num_frames,
333
+ width=width,
334
+ height=height,
335
+ fps=frame_rate,
336
+ )
337
  stage_1_conditionings = self._create_conditionings(
338
  images=images,
339
  video_conditioning=video_conditioning,
 
368
 
369
  stage_2_sigmas = torch.Tensor(STAGE_2_DISTILLED_SIGMA_VALUES).to(self.device)
370
  stage_2_output_shape = VideoPixelShape(
371
+ batch=1,
372
+ frames=num_frames,
373
+ width=width * 2,
374
+ height=height * 2,
375
+ fps=frame_rate,
376
  )
377
+
378
  if apply_video_conditioning_to_stage2:
379
  stage_2_conditionings = self._create_conditionings(
380
  images=images,
 
386
  video_conditioning_frame_idx=video_conditioning_frame_idx,
387
  )
388
  else:
389
+ stage_2_conditionings = image_conditionings_by_adding_guiding_latent(
390
  images=images,
391
  height=stage_2_output_shape.height,
392
  width=stage_2_output_shape.width,
 
440
  video_conditioning_frame_idx: int,
441
  dtype: torch.dtype,
442
  ):
443
+ # First frame: replace-latent (strong anchor)
444
+ replace_imgs = []
445
+ # End frame: guiding-latent (avoid empty slice)
446
+ guide_imgs = []
447
+
448
+ for path, frame_idx, strength in images:
449
+ if frame_idx == 0:
450
+ replace_imgs.append((path, frame_idx, strength))
451
+ else:
452
+ guide_imgs.append((path, frame_idx, strength))
453
+
454
+ conditionings = []
455
+ if replace_imgs:
456
+ conditionings += image_conditionings_by_replacing_latent(
457
+ images=replace_imgs,
458
+ height=height,
459
+ width=width,
460
+ video_encoder=video_encoder,
461
+ dtype=dtype,
462
+ device=self.device,
463
+ )
464
+ if guide_imgs:
465
+ conditionings += image_conditionings_by_adding_guiding_latent(
466
+ images=guide_imgs,
467
+ height=height,
468
+ width=width,
469
+ video_encoder=video_encoder,
470
+ dtype=dtype,
471
+ device=self.device,
472
+ )
473
+
474
 
475
  # 2) Optional: add video conditioning (IC-LoRA style)
476
  if not video_conditioning: