alexnasa committed on
Commit
c1c515b
·
verified ·
1 Parent(s): a733a26

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +457 -49
app.py CHANGED
@@ -8,6 +8,15 @@ import torch.nn.functional as F
8
  import torchaudio
9
  import os
10
  from typing import Any
 
 
 
 
 
 
 
 
 
11
 
12
  def _coerce_audio_path(audio_path: Any) -> str:
13
  # Common Gradio case: tuple where first item is the filepath
@@ -25,6 +34,41 @@ def _coerce_audio_path(audio_path: Any) -> str:
25
 
26
  return os.fspath(audio_path)
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
 
30
  def match_audio_to_duration(
@@ -75,8 +119,6 @@ def match_audio_to_duration(
75
 
76
  def sh(cmd): subprocess.check_call(cmd, shell=True)
77
 
78
- sh("pip install --no-deps easy_dwpose")
79
-
80
  # Add packages to Python path
81
  current_dir = Path(__file__).parent
82
  sys.path.insert(0, str(current_dir / "packages" / "ltx-pipelines" / "src"))
@@ -118,8 +160,8 @@ from ltx_pipelines.utils import ModelLedger
118
  from ltx_pipelines.utils.helpers import generate_enhanced_prompt
119
  import imageio
120
  import cv2
121
- from controlnet_aux import CannyDetector
122
- from easy_dwpose import DWposeDetector
123
 
124
 
125
  # HuggingFace Hub defaults
@@ -166,6 +208,9 @@ model_ledger = ModelLedger(
166
  )
167
 
168
  canny_processor = CannyDetector()
 
 
 
169
 
170
 
171
  # Load text encoder once and keep it in memory
@@ -186,7 +231,7 @@ def on_lora_change(selected: str):
186
 
187
  def process_video_for_pose(frames, width: int, height: int):
188
 
189
- pose_processor = DWposeDetector("cuda")
190
 
191
  if not frames:
192
  return []
@@ -197,7 +242,7 @@ def process_video_for_pose(frames, width: int, height: int):
197
  pil = Image.fromarray(frame.astype(np.uint8)).convert("RGB")
198
 
199
  # ✅ do NOT pass width/height here (easy_dwpose will handle drawing sizes internally)
200
- pose_img = pose_processor(pil)
201
 
202
  # Ensure it's PIL then resize to your conditioning size
203
  if not isinstance(pose_img, Image.Image):
@@ -219,6 +264,46 @@ def preprocess_video_to_pose_mp4(video_path: str, width: int, height: int, fps:
219
  tmp.close()
220
  return write_video_mp4(pose_frames, fps=fps, out_path=tmp.name)
221
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
 
223
  def load_video_frames(video_path: str):
224
  """Return list of frames as numpy arrays (H,W,3) uint8."""
@@ -230,7 +315,7 @@ def load_video_frames(video_path: str):
230
 
231
 
232
  def process_video_for_canny(frames, width: int, height: int,
233
- low_threshold=50, high_threshold=200):
234
  """
235
  Convert RGB frames -> canny edge frames.
236
  Returns list of np arrays (H,W,3) in float [0..1] (like controlnet_aux output).
@@ -244,6 +329,8 @@ def process_video_for_canny(frames, width: int, height: int,
244
  canny_frames = []
245
  for frame in frames:
246
  # controlnet_aux CannyDetector returns float image in [0..1] if output_type="np"
 
 
247
  canny = canny_processor(
248
  frame,
249
  low_threshold=low_threshold,
@@ -277,6 +364,158 @@ def preprocess_video_to_canny_mp4(video_path: str, width: int, height: int, fps:
277
  tmp.close()
278
  return write_video_mp4(canny_frames, fps=fps, out_path=tmp.name)
279
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
280
 
281
  def encode_text_simple(text_encoder, prompt: str):
282
  """Simple text encoding without using pipeline_utils."""
@@ -420,6 +659,13 @@ detailer_lora_path = get_hub_or_local_checkpoint(
420
  "ltx-2-19b-ic-lora-detailer.safetensors",
421
  )
422
 
 
 
 
 
 
 
 
423
  # Load distilled LoRA as a regular LoRA
424
  loras = [
425
  # --- fused / base behavior ---
@@ -436,9 +682,22 @@ loras = [
436
  LoraPathStrengthAndSDOps(dolly_right_lora_path, DEFAULT_LORA_STRENGTH, LTXV_LORA_COMFY_RENAMING_MAP),
437
  LoraPathStrengthAndSDOps(jib_down_lora_path, DEFAULT_LORA_STRENGTH, LTXV_LORA_COMFY_RENAMING_MAP),
438
  LoraPathStrengthAndSDOps(jib_up_lora_path, DEFAULT_LORA_STRENGTH, LTXV_LORA_COMFY_RENAMING_MAP),
 
439
  ]
440
 
441
  # Runtime-toggle LoRAs (exclude fused distilled at index 0)
 
 
 
 
 
 
 
 
 
 
 
 
442
  RUNTIME_LORA_CHOICES = [
443
  ("No LoRA", -1),
444
  ("Static", 0),
@@ -449,7 +708,7 @@ RUNTIME_LORA_CHOICES = [
449
  ("Slide Right", 5),
450
  ("Slide Down", 6),
451
  ("Slide Up", 7),
452
-
453
  ]
454
 
455
  # Initialize pipeline WITHOUT text encoder (gemma_root=None)
@@ -556,6 +815,18 @@ class RadioAnimated(gr.HTML):
556
 
557
  // Recalc on resize (important in Gradio layouts)
558
  window.addEventListener('resize', () => setHighlightByIndex(currentIdx));
 
 
 
 
 
 
 
 
 
 
 
 
559
  })();
560
 
561
  """
@@ -818,30 +1089,34 @@ class CameraDropdown(gr.HTML):
818
  **kwargs
819
  )
820
 
821
- def generate_video_example(input_image, prompt, camera_lora, resolution, audio_path = None, progress=gr.Progress(track_tqdm=True)):
822
 
823
  w, h = apply_resolution(resolution)
824
 
825
- output_video = generate_video(
826
- input_image,
827
- prompt,
828
- 10, # duration seconds
829
- True, # enhance_prompt
830
- 42, # seed
831
- True, # randomize_seed
832
- h, # height
833
- w, # width
834
- camera_lora,
835
- audio_path,
836
- progress
837
- )
838
-
 
 
839
  return output_video
840
 
841
  def get_duration(
842
  input_image,
843
  prompt,
844
  duration,
 
 
845
  enhance_prompt,
846
  seed,
847
  randomize_seed,
@@ -856,6 +1131,9 @@ def get_duration(
856
  if audio_path is not None:
857
  extra_time += 10
858
 
 
 
 
859
  if duration <= 3:
860
  return 60 + extra_time
861
  elif duration <= 5:
@@ -865,11 +1143,14 @@ def get_duration(
865
  else:
866
  return 180 + extra_time
867
 
 
868
  @spaces.GPU(duration=get_duration)
869
  def generate_video(
870
  input_image,
871
  prompt: str,
872
  duration: float,
 
 
873
  enhance_prompt: bool = True,
874
  seed: int = 42,
875
  randomize_seed: bool = True,
@@ -885,6 +1166,7 @@ def generate_video(
885
  input_image: Optional input image for image-to-video. If provided, it is injected at frame 0 to guide motion.
886
  prompt: Text description of the scene, motion, and cinematic style to generate.
887
  duration: Desired video length in seconds. Converted to frames using a fixed 24 FPS rate.
 
888
  enhance_prompt: Whether to enhance the prompt using the prompt enhancer before encoding.
889
  seed: Base random seed for reproducibility (ignored if randomize_seed is True).
890
  randomize_seed: If True, a random seed is generated for each run.
@@ -920,12 +1202,42 @@ def generate_video(
920
  # Calculate num_frames from duration (using fixed 24 fps)
921
  frame_rate = 24.0
922
  num_frames = int(duration * frame_rate) + 1 # +1 to ensure we meet the duration
 
923
 
924
  with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
925
  output_path = tmpfile.name
926
 
927
 
928
  images = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
929
 
930
  if input_image is not None:
931
  images = [(input_image, 0, 1.0)]
@@ -955,7 +1267,9 @@ def generate_video(
955
  _, n_audio_context = encode_text_simple(text_encoder, "") # returns tensors on GPU already
956
  del audio_context
957
  audio_context = n_audio_context
958
- camera_lora = "Static"
 
 
959
 
960
  torch.cuda.empty_cache()
961
 
@@ -982,23 +1296,24 @@ def generate_video(
982
  input_waveform = None
983
  input_waveform_sample_rate = None
984
 
985
- # Run inference - progress automatically tracks tqdm from pipeline
986
- with torch.inference_mode():
987
- pipeline(
988
- prompt=prompt,
989
- output_path=str(output_path),
990
- seed=current_seed,
991
- height=height,
992
- width=width,
993
- num_frames=num_frames,
994
- frame_rate=frame_rate,
995
- images=images,
996
- tiling_config=TilingConfig.default(),
997
- video_context=video_context,
998
- audio_context=audio_context,
999
- input_waveform=input_waveform,
1000
- input_waveform_sample_rate=input_waveform_sample_rate,
1001
- )
 
1002
  del video_context, audio_context
1003
  torch.cuda.empty_cache()
1004
  print("successful generation")
@@ -1022,6 +1337,12 @@ def apply_duration(duration: str):
1022
  duration_s = int(duration[:-1])
1023
  return duration_s
1024
 
 
 
 
 
 
 
1025
 
1026
  css = """
1027
 
@@ -1130,6 +1451,45 @@ css = """
1130
  #false:checked ~ .toggle-highlight {
1131
  transform: translateX(100%);
1132
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1133
  """
1134
 
1135
  css += """
@@ -1678,15 +2038,27 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
1678
  """
1679
  )
1680
  with gr.Column(elem_id="col-container"):
 
 
 
 
 
 
1681
  with gr.Row():
1682
  with gr.Column(elem_id="step-column"):
1683
-
1684
  input_image = gr.Image(
1685
  label="First Frame (Optional)",
1686
  type="filepath",
1687
  height=256
1688
  )
1689
 
 
 
 
 
 
 
1690
  relocate = gr.HTML(
1691
  value="",
1692
  html_template="<div></div>",
@@ -1757,7 +2129,7 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
1757
 
1758
  with gr.Column(elem_id="step-column"):
1759
  output_video = gr.Video(label="Generated Video", autoplay=True, height=512)
1760
-
1761
  with gr.Row(elem_id="controls-row"):
1762
 
1763
  duration_ui = CameraDropdown(
@@ -1805,7 +2177,7 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
1805
  height = gr.Number(label="Height", value=DEFAULT_1_STAGE_HEIGHT, precision=0, visible=False)
1806
 
1807
  camera_ui = CameraDropdown(
1808
- choices=[name for name, _ in RUNTIME_LORA_CHOICES],
1809
  value="No LoRA",
1810
  title="Camera LoRA",
1811
  elem_id="camera_ui",
@@ -1814,7 +2186,7 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
1814
  # Hidden real dropdown (backend value)
1815
  camera_lora = gr.Dropdown(
1816
  label="Camera Control LoRA",
1817
- choices=[name for name, _ in RUNTIME_LORA_CHOICES],
1818
  value="No LoRA",
1819
  visible=False
1820
  )
@@ -1828,6 +2200,14 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
1828
  api_visibility="private"
1829
  )
1830
 
 
 
 
 
 
 
 
 
1831
  duration_ui.change(
1832
  fn=apply_duration,
1833
  inputs=duration_ui,
@@ -1854,6 +2234,8 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
1854
  input_image,
1855
  prompt,
1856
  duration,
 
 
1857
  enhance_prompt,
1858
  seed,
1859
  randomize_seed,
@@ -1873,6 +2255,8 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
1873
  "A fuzzy puppet superhero character resembling a female puppet with blonde hair and a blue superhero suit sleeping in bed and just waking up, she gradually gets up, rubbing her eyes and looking at her dog that just popped on the bed. the scene feels chaotic, comedic, and emotional with expressive puppet reactions, cinematic lighting, smooth camera motion, shallow depth of field, and high-quality puppet-style animation",
1874
  "Static",
1875
  "16:9",
 
 
1876
  "supergirl.m4a"
1877
  ],
1878
  [
@@ -1880,13 +2264,35 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
1880
  "A fuzzy puppet superhero character resembling a female puppet with blonde hair and a blue superhero suit stands inside an icy cave made of frozen walls and icicles, she looks panicked and frantic, rapidly turning her head left and right and scanning the cave while waving her arms and shouting angrily and desperately, mouthing the words “where the hell is my dog,” her movements exaggerated and puppet-like with high energy and urgency, suddenly a second puppet dog bursts into frame from the side, jumping up excitedly and tackling her affectionately while licking her face repeatedly, she freezes in surprise and then breaks into relief and laughter as the dog continues licking her, the scene feels chaotic, comedic, and emotional with expressive puppet reactions, cinematic lighting, smooth camera motion, shallow depth of field, and high-quality puppet-style animation",
1881
  "No LoRA",
1882
  "16:9",
 
 
 
 
 
 
 
 
 
 
 
1883
  None,
1884
  ],
 
 
 
 
 
 
 
 
 
1885
  [
1886
  "highland.png",
1887
  "Realistic POV selfie-style video in a snowy, foggy field. Two shaggy Highland cows with long curved horns stand ahead. The camera is handheld and slightly shaky. The woman filming talks nervously and excitedly in a vlog tone: \"Oh my god guys… look how big those horns are… I’m kinda scared.\" The cow on the left walks toward the camera in a cute, bouncy, hopping way, curious and gentle. Snow crunches under its hooves, breath visible in the cold air. The horns look massive from the POV. As the cow gets very close, its wet nose with slight dripping fills part of the frame. She laughs nervously but reaches out and pets the cow. The cow makes deep, soft, interesting mooing and snorting sounds, calm and friendly. Ultra-realistic, natural lighting, immersive audio, documentary-style realism.",
1888
  "No LoRA",
1889
  "16:9",
 
 
1890
  None,
1891
  ],
1892
  [
@@ -1894,6 +2300,8 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
1894
  "A cinematic dolly out of Wednesday Addams frozen mid-dance on a dark, blue-lit ballroom floor as students move indistinctly behind her, their footsteps and muffled music reduced to a distant, underwater thrum; the audio foregrounds her steady breathing and the faint rustle of fabric as she slowly raises one arm, never breaking eye contact with the camera, then after a deliberately long silence she speaks in a flat, dry, perfectly controlled voice, “I don’t dance… I vibe code,” each word crisp and unemotional, followed by an abrupt cutoff of her voice as the background sound swells slightly, reinforcing the deadpan humor, with precise lip sync, minimal facial movement, stark gothic lighting, and cinematic realism.",
1895
  "Zoom Out",
1896
  "16:9",
 
 
1897
  None,
1898
  ],
1899
  [
@@ -1901,15 +2309,15 @@ with gr.Blocks(title="LTX-2 Video Distilled 🎥🔈") as demo:
1901
  "An astronaut hatches from a fragile egg on the surface of the Moon, the shell cracking and peeling apart in gentle low-gravity motion. Fine lunar dust lifts and drifts outward with each movement, floating in slow arcs before settling back onto the ground. The astronaut pushes free in a deliberate, weightless motion, small fragments of the egg tumbling and spinning through the air. In the background, the deep darkness of space subtly shifts as stars glide with the camera's movement, emphasizing vast depth and scale. The camera performs a smooth, cinematic slow push-in, with natural parallax between the foreground dust, the astronaut, and the distant starfield. Ultra-realistic detail, physically accurate low-gravity motion, cinematic lighting, and a breath-taking, movie-like shot.",
1902
  "Static",
1903
  "1:1",
 
 
1904
  None,
1905
  ],
1906
-
1907
-
1908
  ],
1909
  fn=generate_video_example,
1910
- inputs=[input_image, prompt_ui, camera_ui, resolution_ui, audio_input],
1911
  outputs = [output_video],
1912
- label="I2V Examples",
1913
  cache_examples=True,
1914
  )
1915
 
 
8
  import torchaudio
9
  import os
10
  from typing import Any
11
+ import time
12
+ from contextlib import contextmanager
13
+
14
@contextmanager
def timer(name: str):
    """Print `name` on entry and the elapsed wall-clock seconds on (successful) exit."""
    began = time.time()
    print(f"{name}...")
    yield
    elapsed = time.time() - began
    print(f" -> {name} completed in {elapsed:.2f} sec")
20
 
21
  def _coerce_audio_path(audio_path: Any) -> str:
22
  # Common Gradio case: tuple where first item is the filepath
 
34
 
35
  return os.fspath(audio_path)
36
 
37
def extract_audio_wav_ffmpeg(video_path: str, target_sr: int = 48000) -> str | None:
    """
    Extract audio from a video into a temp WAV (mono, target_sr).

    Returns the WAV path, or None if the video has no audio stream
    (or ffprobe fails on the file).
    """
    # Probe for an audio stream FIRST. The original allocated the temp WAV
    # path before probing, leaking an orphan empty temp file whenever the
    # video had no audio.
    probe_cmd = [
        "ffprobe", "-v", "error",
        "-select_streams", "a:0",
        "-show_entries", "stream=codec_type",
        "-of", "default=nw=1:nk=1",
        video_path,
    ]
    try:
        out = subprocess.check_output(probe_cmd).decode("utf-8").strip()
        if not out:
            return None
    except subprocess.CalledProcessError:
        return None

    out_path = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name

    # Extract + resample + downmix: mono, target_sr, 16-bit PCM.
    cmd = [
        "ffmpeg", "-y", "-v", "error",
        "-i", video_path,
        "-vn",                      # drop video stream
        "-ac", "1",                 # mono
        "-ar", str(int(target_sr)), # resample
        "-c:a", "pcm_s16le",
        out_path,
    ]
    subprocess.check_call(cmd)
    return out_path
71
+
72
 
73
 
74
  def match_audio_to_duration(
 
119
 
120
  def sh(cmd): subprocess.check_call(cmd, shell=True)
121
 
 
 
122
  # Add packages to Python path
123
  current_dir = Path(__file__).parent
124
  sys.path.insert(0, str(current_dir / "packages" / "ltx-pipelines" / "src"))
 
160
  from ltx_pipelines.utils.helpers import generate_enhanced_prompt
161
  import imageio
162
  import cv2
163
+ from controlnet_aux import CannyDetector, MidasDetector
164
+ from dwpose import DwposeDetector
165
 
166
 
167
  # HuggingFace Hub defaults
 
208
  )
209
 
210
  canny_processor = CannyDetector()
211
+ # Depth (MiDaS) processor
212
+ # Downloads annotator weights automatically the first time.
213
+ depth_processor = MidasDetector.from_pretrained("lllyasviel/Annotators").to("cuda")
214
 
215
 
216
  # Load text encoder once and keep it in memory
 
231
 
232
  def process_video_for_pose(frames, width: int, height: int):
233
 
234
+ pose_processor = DwposeDetector.from_pretrained_default()
235
 
236
  if not frames:
237
  return []
 
242
  pil = Image.fromarray(frame.astype(np.uint8)).convert("RGB")
243
 
244
  # ✅ do NOT pass width/height here (easy_dwpose will handle drawing sizes internally)
245
+ pose_img = pose_processor(pil, include_body=True, include_hand=True, include_face=True)
246
 
247
  # Ensure it's PIL then resize to your conditioning size
248
  if not isinstance(pose_img, Image.Image):
 
264
  tmp.close()
265
  return write_video_mp4(pose_frames, fps=fps, out_path=tmp.name)
266
 
267
def process_video_for_depth(frames, width: int, height: int):
    """
    Convert RGB frames -> depth map frames.
    Returns list of np arrays (H,W,3) float in [0..1] (controlnet-style).
    """
    if not frames:
        return []

    first_h, first_w = frames[0].shape[0], frames[0].shape[1]
    detect_resolution = max(first_h, first_w)
    image_resolution = max(width, height)

    depth_maps = []
    for rgb in frames:
        # controlnet_aux MidasDetector returns float [0..1] when output_type="np"
        depth = depth_processor(
            rgb,
            detect_resolution=detect_resolution,
            image_resolution=image_resolution,
            output_type="np",
        )

        # Normalize channel layout: some versions emit HxW or HxWx1 maps.
        if depth.ndim == 2:
            depth = np.stack((depth, depth, depth), axis=-1)
        elif depth.shape[-1] == 1:
            depth = np.repeat(depth, 3, axis=-1)

        depth_maps.append(depth)

    return depth_maps
297
+
298
+
299
def preprocess_video_to_depth_mp4(video_path: str, width: int, height: int, fps: float):
    """End-to-end: read video -> depth -> write temp mp4 -> return path."""
    decoded = load_video_frames(video_path)
    depth_maps = process_video_for_depth(decoded, width=width, height=height)
    out = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False)
    out.close()
    return write_video_mp4(depth_maps, fps=fps, out_path=out.name)
306
+
307
 
308
  def load_video_frames(video_path: str):
309
  """Return list of frames as numpy arrays (H,W,3) uint8."""
 
315
 
316
 
317
  def process_video_for_canny(frames, width: int, height: int,
318
+ low_threshold=20, high_threshold=60):
319
  """
320
  Convert RGB frames -> canny edge frames.
321
  Returns list of np arrays (H,W,3) in float [0..1] (like controlnet_aux output).
 
329
  canny_frames = []
330
  for frame in frames:
331
  # controlnet_aux CannyDetector returns float image in [0..1] if output_type="np"
332
+ # frame_blur = cv2.GaussianBlur(frame, (3, 3), 0)
333
+
334
  canny = canny_processor(
335
  frame,
336
  low_threshold=low_threshold,
 
364
  tmp.close()
365
  return write_video_mp4(canny_frames, fps=fps, out_path=tmp.name)
366
 
367
+ import json
368
+
369
def probe_video_duration_seconds(video_path: str) -> float:
    """Return duration in seconds using ffprobe."""
    probe_cmd = [
        "ffprobe", "-v", "error",
        "-select_streams", "v:0",
        "-show_entries", "format=duration",
        "-of", "json",
        video_path,
    ]
    raw = subprocess.check_output(probe_cmd).decode("utf-8")
    info = json.loads(raw)
    return float(info["format"]["duration"])
382
+
383
def trim_video_to_seconds_ffmpeg(video_path: str, target_seconds: float, fps: float | None = None) -> str:
    """
    Trim video to [0, target_seconds]. Re-encode for accuracy & compatibility.
    If fps is provided, also normalize fps.
    Returns new temp mp4 path.

    Raises subprocess.CalledProcessError if ffmpeg fails.
    """
    # Never ask ffmpeg for a zero/negative duration.
    target_seconds = max(0.01, float(target_seconds))

    out_path = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name

    cmd = ["ffmpeg", "-y", "-v", "error"]

    # Accurate trim: use -t and re-encode (stream-copy trims snap to keyframes).
    cmd += ["-i", video_path, "-t", f"{target_seconds:.6f}"]

    # Only one filter is ever used, so build it inline instead of the
    # original's single-element list + join dance.
    if fps is not None:
        cmd += ["-vf", f"fps={float(fps)}"]

    # Safe default encode
    cmd += [
        "-c:v", "libx264", "-pix_fmt", "yuv420p", "-preset", "veryfast", "-crf", "18",
        "-an",  # conditioning video doesn't need audio
        out_path,
    ]

    subprocess.check_call(cmd)
    return out_path
415
+
416
def extract_first_frame_png(video_path: str) -> str:
    """Extract first frame as png; returns png path."""
    png_path = tempfile.NamedTemporaryFile(suffix=".png", delete=False).name
    extract_cmd = [
        "ffmpeg", "-y", "-v", "error",
        "-i", video_path,
        "-frames:v", "1",
        png_path,
    ]
    subprocess.check_call(extract_cmd)
    return png_path
+ return out_path
427
+
428
+ def _coerce_video_path(video_path: Any) -> str:
429
+ if isinstance(video_path, tuple) and len(video_path) > 0:
430
+ video_path = video_path[0]
431
+ if isinstance(video_path, dict):
432
+ video_path = video_path.get("name") or video_path.get("path")
433
+ if not isinstance(video_path, (str, bytes, os.PathLike)):
434
+ raise TypeError(f"video_path must be a path-like, got {type(video_path)}: {video_path}")
435
+ return os.fspath(video_path)
436
+
437
+
438
def prepare_conditioning_video_mp4(
    video_path: Any,
    target_num_frames: int,
    target_fps: float,
) -> tuple[str, str]:
    """
    Returns (conditioning_mp4_path, first_frame_png_path).

    Produces an mp4 with exactly target_num_frames frames:
    a longer source is truncated, a shorter one is padded by
    repeating its final frame.
    """
    src = _coerce_video_path(video_path)

    decoded = load_video_frames(src)  # HWC uint8 frames
    if not decoded:
        raise ValueError("No frames decoded from input video")

    # Pad with copies of the last frame, then cut to the exact length.
    deficit = target_num_frames - len(decoded)
    if deficit > 0:
        decoded = decoded + [decoded[-1]] * deficit
    decoded = decoded[:target_num_frames]

    # First frame becomes the input_image PNG.
    first_png = tempfile.NamedTemporaryFile(suffix=".png", delete=False).name
    Image.fromarray(decoded[0]).save(first_png)

    # write_video_mp4 expects float frames in [0..1].
    normalized = [frame.astype(np.float32) / 255.0 for frame in decoded]
    cond_mp4 = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
    write_video_mp4(normalized, fps=target_fps, out_path=cond_mp4)

    return cond_mp4, first_png
475
+
476
def valid_1_plus_8k(n: int) -> int:
    """Largest integer <= n that is of the form 1 + 8*k (k>=0)."""
    if n <= 0:
        return 0
    k = (n - 1) // 8
    return 8 * k + 1
481
+
482
def prepare_conditioning_video_mp4_no_pad(
    video_path: Any,
    duration_frames: int,
    target_fps: float,
) -> tuple[str, str, int]:
    """
    Returns (conditioning_mp4_path, first_frame_png_path, used_num_frames).

    Trims the decoded source to the largest valid length of the form
    1 + 8*k that fits within both the source and duration_frames.
    Never pads, loops, or repeats the last frame.
    """
    src = _coerce_video_path(video_path)

    decoded = load_video_frames(src)  # HWC uint8 frames
    if not decoded:
        raise ValueError("No frames decoded from input video")

    capped = min(len(decoded), duration_frames)
    n_used = valid_1_plus_8k(capped)

    # A one-frame source yields n_used == 1, which is still a valid length.
    if n_used == 0:
        raise ValueError(f"Video too short: {capped} frames")

    decoded = decoded[:n_used]

    first_png = tempfile.NamedTemporaryFile(suffix=".png", delete=False).name
    Image.fromarray(decoded[0]).save(first_png)

    normalized = [frame.astype(np.float32) / 255.0 for frame in decoded]
    cond_mp4 = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
    write_video_mp4(normalized, fps=target_fps, out_path=cond_mp4)

    return cond_mp4, first_png, n_used
518
+
519
 
520
  def encode_text_simple(text_encoder, prompt: str):
521
  """Simple text encoding without using pipeline_utils."""
 
659
  "ltx-2-19b-ic-lora-detailer.safetensors",
660
  )
661
 
662
+ pose_lora_path = get_hub_or_local_checkpoint(
663
+ "Lightricks/LTX-2-19b-IC-LoRA-Pose-Control",
664
+ "ltx-2-19b-ic-lora-pose-control.safetensors",
665
+ )
666
+
667
+
668
+
669
  # Load distilled LoRA as a regular LoRA
670
  loras = [
671
  # --- fused / base behavior ---
 
682
  LoraPathStrengthAndSDOps(dolly_right_lora_path, DEFAULT_LORA_STRENGTH, LTXV_LORA_COMFY_RENAMING_MAP),
683
  LoraPathStrengthAndSDOps(jib_down_lora_path, DEFAULT_LORA_STRENGTH, LTXV_LORA_COMFY_RENAMING_MAP),
684
  LoraPathStrengthAndSDOps(jib_up_lora_path, DEFAULT_LORA_STRENGTH, LTXV_LORA_COMFY_RENAMING_MAP),
685
+ LoraPathStrengthAndSDOps(pose_lora_path, DEFAULT_LORA_STRENGTH, LTXV_LORA_COMFY_RENAMING_MAP),
686
  ]
687
 
688
  # Runtime-toggle LoRAs (exclude fused distilled at index 0)
689
+ VISIBLE_RUNTIME_LORA_CHOICES = [
690
+ ("No LoRA", -1),
691
+ ("Static", 0),
692
+ ("Detailer", 1),
693
+ ("Zoom In", 2),
694
+ ("Zoom Out", 3),
695
+ ("Slide Left", 4),
696
+ ("Slide Right", 5),
697
+ ("Slide Down", 6),
698
+ ("Slide Up", 7),
699
+ ]
700
+
701
  RUNTIME_LORA_CHOICES = [
702
  ("No LoRA", -1),
703
  ("Static", 0),
 
708
  ("Slide Right", 5),
709
  ("Slide Down", 6),
710
  ("Slide Up", 7),
711
+ ("Pose", 8),
712
  ]
713
 
714
  # Initialize pipeline WITHOUT text encoder (gemma_root=None)
 
815
 
816
  // Recalc on resize (important in Gradio layouts)
817
  window.addEventListener('resize', () => setHighlightByIndex(currentIdx));
818
+
819
+ // sync from Python (Examples / backend updates)
820
+ let last = props.value;
821
+ const syncFromProps = () => {
822
+ if (props.value !== last) {
823
+ last = props.value;
824
+ setCheckedByValue(last, false);
825
+ }
826
+ requestAnimationFrame(syncFromProps);
827
+ };
828
+ requestAnimationFrame(syncFromProps);
829
+
830
  })();
831
 
832
  """
 
1089
  **kwargs
1090
  )
1091
 
1092
def generate_video_example(input_image, prompt, camera_lora, resolution, radioanimated_mode, input_video, input_audio, progress=gr.Progress(track_tqdm=True)):
    """Example-table entry point: run generate_video with fixed defaults (10 s, seed 42)."""
    w, h = apply_resolution(resolution)

    # BUG FIX: the original logged `{duration}`, but no `duration` local exists
    # in this function — it would resolve to a module-level Gradio component
    # (misleading repr) or raise NameError. Log the hard-coded 10 s we pass.
    with timer(f'generating with video path:{input_video} with duration:10 and LoRA:{camera_lora} in {w}x{h}'):
        output_video = generate_video(
            input_image,
            prompt,
            10,                  # duration seconds
            input_video,
            radioanimated_mode,
            True,                # enhance_prompt
            42,                  # seed
            True,                # randomize_seed
            h,                   # height
            w,                   # width
            camera_lora,
            input_audio,
            progress,
        )
    return output_video
1113
 
1114
  def get_duration(
1115
  input_image,
1116
  prompt,
1117
  duration,
1118
+ input_video,
1119
+ radioanimated_mode,
1120
  enhance_prompt,
1121
  seed,
1122
  randomize_seed,
 
1131
  if audio_path is not None:
1132
  extra_time += 10
1133
 
1134
+ if input_video is not None:
1135
+ extra_time += 60
1136
+
1137
  if duration <= 3:
1138
  return 60 + extra_time
1139
  elif duration <= 5:
 
1143
  else:
1144
  return 180 + extra_time
1145
 
1146
+
1147
  @spaces.GPU(duration=get_duration)
1148
  def generate_video(
1149
  input_image,
1150
  prompt: str,
1151
  duration: float,
1152
+ input_video = None,
1153
+ generation_mode = "Image-to-Video",
1154
  enhance_prompt: bool = True,
1155
  seed: int = 42,
1156
  randomize_seed: bool = True,
 
1166
  input_image: Optional input image for image-to-video. If provided, it is injected at frame 0 to guide motion.
1167
  prompt: Text description of the scene, motion, and cinematic style to generate.
1168
  duration: Desired video length in seconds. Converted to frames using a fixed 24 FPS rate.
1169
+ input_video: Optional conditioning video path (mp4). If provided, motion is guided by this video.
1170
  enhance_prompt: Whether to enhance the prompt using the prompt enhancer before encoding.
1171
  seed: Base random seed for reproducibility (ignored if randomize_seed is True).
1172
  randomize_seed: If True, a random seed is generated for each run.
 
1202
  # Calculate num_frames from duration (using fixed 24 fps)
1203
  frame_rate = 24.0
1204
  num_frames = int(duration * frame_rate) + 1 # +1 to ensure we meet the duration
1205
+ video_seconds = int(duration)
1206
 
1207
  with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
1208
  output_path = tmpfile.name
1209
 
1210
 
1211
  images = []
1212
+ videos = []
1213
+
1214
+ if generation_mode == "Motion Control":
1215
+ if input_video is not None:
1216
+ cond_mp4, first_png, used_frames = prepare_conditioning_video_mp4_no_pad(
1217
+ video_path=input_video,
1218
+ duration_frames=num_frames,
1219
+ target_fps=frame_rate,
1220
+ )
1221
+
1222
+ if input_image is None:
1223
+ images = [(first_png, 0, 1.0)]
1224
+
1225
+ if audio_path is None:
1226
+ src_video_path = _coerce_video_path(input_video)
1227
+ extracted_audio_tmp = extract_audio_wav_ffmpeg(src_video_path, target_sr=48000)
1228
+
1229
+ if extracted_audio_tmp is not None:
1230
+ audio_path = extracted_audio_tmp
1231
+
1232
+ with timer("Pose selected: preprocessing conditioning video to pose..."):
1233
+ cond_path = preprocess_video_to_pose_mp4(
1234
+ video_path=cond_mp4,
1235
+ width=width,
1236
+ height=height,
1237
+ fps=frame_rate,
1238
+ )
1239
+ videos = [(cond_path, 1.0)]
1240
+ camera_lora = "Pose"
1241
 
1242
  if input_image is not None:
1243
  images = [(input_image, 0, 1.0)]
 
1267
  _, n_audio_context = encode_text_simple(text_encoder, "") # returns tensors on GPU already
1268
  del audio_context
1269
  audio_context = n_audio_context
1270
+
1271
+ if len(videos) == 0:
1272
+ camera_lora = "Static"
1273
 
1274
  torch.cuda.empty_cache()
1275
 
 
1296
  input_waveform = None
1297
  input_waveform_sample_rate = None
1298
 
1299
+ with timer(f'generating with video path:{input_video} and LoRA:{camera_lora} in {width}x{height}'):
1300
+ with torch.inference_mode():
1301
+ pipeline(
1302
+ prompt=prompt,
1303
+ output_path=str(output_path),
1304
+ seed=current_seed,
1305
+ height=height,
1306
+ width=width,
1307
+ num_frames=num_frames,
1308
+ frame_rate=frame_rate,
1309
+ images=images,
1310
+ video_conditioning=videos,
1311
+ tiling_config=TilingConfig.default(),
1312
+ video_context=video_context,
1313
+ audio_context=audio_context,
1314
+ input_waveform=input_waveform,
1315
+ input_waveform_sample_rate=input_waveform_sample_rate,
1316
+ )
1317
  del video_context, audio_context
1318
  torch.cuda.empty_cache()
1319
  print("successful generation")
 
1337
  duration_s = int(duration[:-1])
1338
  return duration_s
1339
 
1340
+ def on_mode_change(selected: str):
1341
+ is_i2v = (selected == "Image-to-Video")
1342
+
1343
+ return gr.update(visible=not is_i2v)
1344
+
1345
+
1346
 
1347
  css = """
1348
 
 
1451
  #false:checked ~ .toggle-highlight {
1452
  transform: translateX(100%);
1453
  }
1454
+
1455
+ /* Center items inside that row */
1456
+ #mode-row{
1457
+ justify-content: center !important;
1458
+ align-items: center !important;
1459
+ }
1460
+
1461
+ /* Center the mode row contents */
1462
+ #mode-row {
1463
+ display: flex !important;
1464
+ justify-content: center !important;
1465
+ align-items: center !important;
1466
+ width: 100% !important;
1467
+ }
1468
+
1469
+ /* Stop Gradio from making children stretch */
1470
+ #mode-row > * {
1471
+ flex: 0 0 auto !important;
1472
+ width: auto !important;
1473
+ min-width: 0 !important;
1474
+ }
1475
+
1476
+ /* Specifically ensure the HTML component wrapper doesn't take full width */
1477
+ #mode-row .gr-html,
1478
+ #mode-row .gradio-html,
1479
+ #mode-row .prose,
1480
+ #mode-row .block {
1481
+ width: auto !important;
1482
+ flex: 0 0 auto !important;
1483
+ display: inline-block !important;
1484
+ }
1485
+
1486
+ /* Center the pill itself */
1487
+ #radioanimated_mode {
1488
+ display: inline-flex !important;
1489
+ justify-content: center !important;
1490
+ width: auto !important;
1491
+ }
1492
+
1493
  """
1494
 
1495
  css += """
 
2038
  """
2039
  )
2040
  with gr.Column(elem_id="col-container"):
2041
+ with gr.Row(elem_id="mode-row"):
2042
+ radioanimated_mode = RadioAnimated(
2043
+ choices=["Image-to-Video", "Motion Control"],
2044
+ value="Image-to-Video",
2045
+ elem_id="radioanimated_mode"
2046
+ )
2047
  with gr.Row():
2048
  with gr.Column(elem_id="step-column"):
2049
+
2050
  input_image = gr.Image(
2051
  label="First Frame (Optional)",
2052
  type="filepath",
2053
  height=256
2054
  )
2055
 
2056
+ input_video = gr.Video(
2057
+ label="Motion Reference Video",
2058
+ height=256,
2059
+ visible=False,
2060
+ )
2061
+
2062
  relocate = gr.HTML(
2063
  value="",
2064
  html_template="<div></div>",
 
2129
 
2130
  with gr.Column(elem_id="step-column"):
2131
  output_video = gr.Video(label="Generated Video", autoplay=True, height=512)
2132
+
2133
  with gr.Row(elem_id="controls-row"):
2134
 
2135
  duration_ui = CameraDropdown(
 
2177
  height = gr.Number(label="Height", value=DEFAULT_1_STAGE_HEIGHT, precision=0, visible=False)
2178
 
2179
  camera_ui = CameraDropdown(
2180
+ choices=[name for name, _ in VISIBLE_RUNTIME_LORA_CHOICES],
2181
  value="No LoRA",
2182
  title="Camera LoRA",
2183
  elem_id="camera_ui",
 
2186
  # Hidden real dropdown (backend value)
2187
  camera_lora = gr.Dropdown(
2188
  label="Camera Control LoRA",
2189
+ choices=[name for name, _ in VISIBLE_RUNTIME_LORA_CHOICES],
2190
  value="No LoRA",
2191
  visible=False
2192
  )
 
2200
  api_visibility="private"
2201
  )
2202
 
2203
+ radioanimated_mode.change(
2204
+ fn=on_mode_change,
2205
+ inputs=radioanimated_mode,
2206
+ outputs=[input_video],
2207
+ api_visibility="private",
2208
+ )
2209
+
2210
+
2211
  duration_ui.change(
2212
  fn=apply_duration,
2213
  inputs=duration_ui,
 
2234
  input_image,
2235
  prompt,
2236
  duration,
2237
+ input_video,
2238
+ radioanimated_mode,
2239
  enhance_prompt,
2240
  seed,
2241
  randomize_seed,
 
2255
  "A fuzzy puppet superhero character resembling a female puppet with blonde hair and a blue superhero suit sleeping in bed and just waking up, she gradually gets up, rubbing her eyes and looking at her dog that just popped on the bed. the scene feels chaotic, comedic, and emotional with expressive puppet reactions, cinematic lighting, smooth camera motion, shallow depth of field, and high-quality puppet-style animation",
2256
  "Static",
2257
  "16:9",
2258
+ "Image-to-Video",
2259
+ None,
2260
  "supergirl.m4a"
2261
  ],
2262
  [
 
2264
  "A fuzzy puppet superhero character resembling a female puppet with blonde hair and a blue superhero suit stands inside an icy cave made of frozen walls and icicles, she looks panicked and frantic, rapidly turning her head left and right and scanning the cave while waving her arms and shouting angrily and desperately, mouthing the words “where the hell is my dog,” her movements exaggerated and puppet-like with high energy and urgency, suddenly a second puppet dog bursts into frame from the side, jumping up excitedly and tackling her affectionately while licking her face repeatedly, she freezes in surprise and then breaks into relief and laughter as the dog continues licking her, the scene feels chaotic, comedic, and emotional with expressive puppet reactions, cinematic lighting, smooth camera motion, shallow depth of field, and high-quality puppet-style animation",
2265
  "No LoRA",
2266
  "16:9",
2267
+ "Image-to-Video",
2268
+ None,
2269
+ None,
2270
+ ],
2271
+ [
2272
+ "clay.png",
2273
+ "a character doing a tiktok dance by moving their heads side to side with dramatic lighting and cinematic effects and singing",
2274
+ "Pose",
2275
+ "9:16",
2276
+ "Motion Control",
2277
+ "tiktok.mp4",
2278
  None,
2279
  ],
2280
+ [
2281
+ "paint.png",
2282
+ "a character doing a tiktok dance by moving their heads side to side with dramatic lighting and cinematic effects and singing",
2283
+ "Pose",
2284
+ "9:16",
2285
+ "Motion Control",
2286
+ "tiktok.mp4",
2287
+ None,
2288
+ ],
2289
  [
2290
  "highland.png",
2291
  "Realistic POV selfie-style video in a snowy, foggy field. Two shaggy Highland cows with long curved horns stand ahead. The camera is handheld and slightly shaky. The woman filming talks nervously and excitedly in a vlog tone: \"Oh my god guys… look how big those horns are… I’m kinda scared.\" The cow on the left walks toward the camera in a cute, bouncy, hopping way, curious and gentle. Snow crunches under its hooves, breath visible in the cold air. The horns look massive from the POV. As the cow gets very close, its wet nose with slight dripping fills part of the frame. She laughs nervously but reaches out and pets the cow. The cow makes deep, soft, interesting mooing and snorting sounds, calm and friendly. Ultra-realistic, natural lighting, immersive audio, documentary-style realism.",
2292
  "No LoRA",
2293
  "16:9",
2294
+ "Image-to-Video",
2295
+ None,
2296
  None,
2297
  ],
2298
  [
 
2300
  "A cinematic dolly out of Wednesday Addams frozen mid-dance on a dark, blue-lit ballroom floor as students move indistinctly behind her, their footsteps and muffled music reduced to a distant, underwater thrum; the audio foregrounds her steady breathing and the faint rustle of fabric as she slowly raises one arm, never breaking eye contact with the camera, then after a deliberately long silence she speaks in a flat, dry, perfectly controlled voice, “I don’t dance… I vibe code,” each word crisp and unemotional, followed by an abrupt cutoff of her voice as the background sound swells slightly, reinforcing the deadpan humor, with precise lip sync, minimal facial movement, stark gothic lighting, and cinematic realism.",
2301
  "Zoom Out",
2302
  "16:9",
2303
+ "Image-to-Video",
2304
+ None,
2305
  None,
2306
  ],
2307
  [
 
2309
  "An astronaut hatches from a fragile egg on the surface of the Moon, the shell cracking and peeling apart in gentle low-gravity motion. Fine lunar dust lifts and drifts outward with each movement, floating in slow arcs before settling back onto the ground. The astronaut pushes free in a deliberate, weightless motion, small fragments of the egg tumbling and spinning through the air. In the background, the deep darkness of space subtly shifts as stars glide with the camera's movement, emphasizing vast depth and scale. The camera performs a smooth, cinematic slow push-in, with natural parallax between the foreground dust, the astronaut, and the distant starfield. Ultra-realistic detail, physically accurate low-gravity motion, cinematic lighting, and a breath-taking, movie-like shot.",
2310
  "Static",
2311
  "1:1",
2312
+ "Image-to-Video",
2313
+ None,
2314
  None,
2315
  ],
 
 
2316
  ],
2317
  fn=generate_video_example,
2318
+ inputs=[input_image, prompt_ui, camera_ui, resolution_ui, radioanimated_mode, input_video, audio_input],
2319
  outputs = [output_video],
2320
+ label="Examples",
2321
  cache_examples=True,
2322
  )
2323