linoyts HF Staff committed on
Commit
061c241
·
1 Parent(s): 979ef85

Update app.py (#1)

Browse files

- Update app.py (1fc59c70a8f8f706dcb741b2777c73234eefbd75)

Files changed (1) hide show
  1. app.py +218 -42
app.py CHANGED
@@ -24,19 +24,10 @@ subprocess.run(
24
  )
25
 
26
  print("Installing ltx-core and ltx-pipelines from cloned repo...")
27
- # subprocess.run(
28
- # [sys.executable, "-m", "pip", "install", "--force-reinstall", "--no-deps", "-e",
29
- # os.path.join(LTX_REPO_DIR, "packages", "ltx-core"),
30
- # "-e", os.path.join(LTX_REPO_DIR, "packages", "ltx-pipelines")],
31
- # check=True,
32
- # )
33
  subprocess.run(
34
- [
35
- sys.executable, "-m", "pip", "install",
36
- "--force-reinstall",
37
- "-e", os.path.join(LTX_REPO_DIR, "packages", "ltx-core") + "[fp8-trtllm]",
38
- "-e", os.path.join(LTX_REPO_DIR, "packages", "ltx-pipelines"),
39
- ],
40
  check=True,
41
  )
42
 
@@ -57,13 +48,19 @@ import gradio as gr
57
  import numpy as np
58
  from huggingface_hub import hf_hub_download, snapshot_download
59
 
60
- from ltx_core.loader import LoraPathStrengthAndSDOps
61
- from ltx_core.loader.sd_ops import LTXV_LORA_COMFY_RENAMING_MAP
62
  from ltx_core.model.video_vae import TilingConfig, get_video_chunks_number
63
  from ltx_core.quantization import QuantizationPolicy
 
 
 
 
 
 
64
  from ltx_pipelines.ic_lora import ICLoraPipeline
 
65
  from ltx_pipelines.utils.args import ImageConditioningInput
66
- from ltx_pipelines.utils.media_io import encode_video
 
67
 
68
  # Force-patch xformers attention into the LTX attention module.
69
  from ltx_core.model.transformer import attention as _attn_mod
@@ -113,42 +110,217 @@ checkpoint_path = hf_hub_download(repo_id="linoyts/ltx-2.3-22b-distilled-motion-
113
  spatial_upsampler_path = hf_hub_download(repo_id=LTX_MODEL_REPO, filename="ltx-2.3-spatial-upscaler-x2-1.0.safetensors")
114
  gemma_root = snapshot_download(repo_id=GEMMA_REPO)
115
 
116
- # Pre-download all IC-LoRA checkpoints
117
- ic_lora_paths = {}
118
- for name, info in IC_LORA_OPTIONS.items():
119
- path = hf_hub_download(repo_id=info["repo"], filename=info["filename"])
120
- ic_lora_paths[name] = path
121
- print(f"IC-LoRA '{name}': {path}")
122
-
123
  print(f"Checkpoint: {checkpoint_path}")
124
  print(f"Spatial upsampler: {spatial_upsampler_path}")
125
  print(f"Gemma root: {gemma_root}")
126
 
127
  # Build initial pipeline with the first IC-LoRA
128
  default_lora_name = "Union Control (Depth + Canny)"
129
- default_lora_path = ic_lora_paths[default_lora_name]
130
-
131
  current_pipeline = None
132
  current_lora_name = None
133
 
134
 
135
- def build_pipeline(lora_name: str) -> ICLoraPipeline:
136
- """Build an ICLoraPipeline with the given IC-LoRA."""
137
- lora_path = ic_lora_paths[lora_name]
138
- lora = LoraPathStrengthAndSDOps(
139
- path=lora_path,
140
- strength=1.0,
141
- sd_ops=LTXV_LORA_COMFY_RENAMING_MAP,
142
- )
143
- pipe = ICLoraPipeline(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  distilled_checkpoint_path=checkpoint_path,
145
  spatial_upsampler_path=spatial_upsampler_path,
146
  gemma_root=gemma_root,
147
- # loras=[lora],
148
  loras=[],
149
  quantization=QuantizationPolicy.fp8_cast(),
150
- #quantization=QuantizationPolicy.fp8_scaled_mm()
151
-
152
  )
153
  return pipe
154
 
@@ -273,6 +445,7 @@ def on_highres_toggle(input_image, high_res):
273
  def generate_video(
274
  input_image,
275
  conditioning_video,
 
276
  prompt: str,
277
  duration: float,
278
  ic_lora_choice: str,
@@ -306,6 +479,8 @@ def generate_video(
306
 
307
  print(f"Generating: {height}x{width}, {num_frames} frames ({duration}s), seed={current_seed}")
308
  print(f"IC-LoRA: {ic_lora_choice}, conditioning_strength: {conditioning_strength}")
 
 
309
 
310
  output_dir = Path("outputs")
311
  output_dir.mkdir(exist_ok=True)
@@ -341,6 +516,7 @@ def generate_video(
341
  frame_rate=frame_rate,
342
  images=images,
343
  video_conditioning=video_conditioning,
 
344
  tiling_config=tiling_config,
345
  enhance_prompt=enhance_prompt,
346
  conditioning_attention_strength=1.0,
@@ -371,12 +547,11 @@ def generate_video(
371
 
372
 
373
  with gr.Blocks(title="LTX-2.3 IC-LoRA") as demo:
374
- gr.Markdown("# LTX-2.3 IC-LoRA: Video-to-Video Control")
375
  gr.Markdown(
376
- "Video-to-video transformations using IC-LoRA conditioning "
377
- "(depth + canny union control, motion tracking). Upload a **conditioning video** "
378
- "as the IC-LoRA reference signal, optionally provide an input image for I2V, "
379
- "and describe the desired output. "
380
  "[[model]](https://huggingface.co/Lightricks/LTX-2.3) "
381
  "[[code]](https://github.com/Lightricks/LTX-2)"
382
  )
@@ -388,6 +563,7 @@ with gr.Blocks(title="LTX-2.3 IC-LoRA") as demo:
388
  sources=["upload"],
389
  )
390
  input_image = gr.Image(label="Input Image (Optional)", type="pil")
 
391
  prompt = gr.Textbox(
392
  label="Prompt",
393
  info="Describe the desired output — the IC-LoRA controls structure from the reference",
@@ -441,7 +617,7 @@ with gr.Blocks(title="LTX-2.3 IC-LoRA") as demo:
441
  fn=generate_video,
442
  inputs=[
443
  input_image, conditioning_video,
444
- prompt, duration, ic_lora_choice, conditioning_strength,
445
  enhance_prompt, skip_stage_2,
446
  seed, randomize_seed, height, width,
447
  ],
 
24
  )
25
 
26
  print("Installing ltx-core and ltx-pipelines from cloned repo...")
 
 
 
 
 
 
27
  subprocess.run(
28
+ [sys.executable, "-m", "pip", "install", "--force-reinstall", "--no-deps", "-e",
29
+ os.path.join(LTX_REPO_DIR, "packages", "ltx-core"),
30
+ "-e", os.path.join(LTX_REPO_DIR, "packages", "ltx-pipelines")],
 
 
 
31
  check=True,
32
  )
33
 
 
48
  import numpy as np
49
  from huggingface_hub import hf_hub_download, snapshot_download
50
 
 
 
51
  from ltx_core.model.video_vae import TilingConfig, get_video_chunks_number
52
  from ltx_core.quantization import QuantizationPolicy
53
+ from ltx_core.components.diffusion_steps import EulerDiffusionStep
54
+ from ltx_core.components.noisers import GaussianNoiser
55
+ from ltx_core.model.audio_vae import encode_audio as vae_encode_audio
56
+ from ltx_core.model.upsampler import upsample_video
57
+ from ltx_core.model.video_vae import decode_video as vae_decode_video
58
+ from ltx_core.types import Audio, AudioLatentShape, VideoPixelShape
59
  from ltx_pipelines.ic_lora import ICLoraPipeline
60
+ from ltx_pipelines.utils import cleanup_memory, denoise_audio_video, encode_prompts, euler_denoising_loop, simple_denoising_func
61
  from ltx_pipelines.utils.args import ImageConditioningInput
62
+ from ltx_pipelines.utils.constants import DISTILLED_SIGMA_VALUES, STAGE_2_DISTILLED_SIGMA_VALUES
63
+ from ltx_pipelines.utils.media_io import decode_audio_from_file, encode_video
64
 
65
  # Force-patch xformers attention into the LTX attention module.
66
  from ltx_core.model.transformer import attention as _attn_mod
 
110
  spatial_upsampler_path = hf_hub_download(repo_id=LTX_MODEL_REPO, filename="ltx-2.3-spatial-upscaler-x2-1.0.safetensors")
111
  gemma_root = snapshot_download(repo_id=GEMMA_REPO)
112
 
 
 
 
 
 
 
 
113
  print(f"Checkpoint: {checkpoint_path}")
114
  print(f"Spatial upsampler: {spatial_upsampler_path}")
115
  print(f"Gemma root: {gemma_root}")
116
 
117
  # Build initial pipeline with the first IC-LoRA
118
  default_lora_name = "Union Control (Depth + Canny)"
 
 
119
  current_pipeline = None
120
  current_lora_name = None
121
 
122
 
123
def _fit_audio_latent_frames(latent: torch.Tensor, expected_frames: int) -> torch.Tensor:
    """Trim or zero-pad ``latent`` along dim 2 to exactly ``expected_frames``.

    The latent is indexed as a 4-D tensor whose frame axis is dim 2
    (presumably (batch, channels, frames, mel_bins) — TODO confirm against
    ``AudioLatentShape``). A latent that already has the expected length is
    returned unchanged.
    """
    actual_frames = latent.shape[2]
    if actual_frames > expected_frames:
        return latent[:, :, :expected_frames, :]
    if actual_frames < expected_frames:
        pad = torch.zeros(
            latent.shape[0],
            latent.shape[1],
            expected_frames - actual_frames,
            latent.shape[3],
            device=latent.device,
            dtype=latent.dtype,
        )
        return torch.cat([latent, pad], dim=2)
    return latent


def _make_denoising_loop(transformer, video_context, audio_context):
    """Return a ``denoising_loop_fn`` that runs the Euler loop with ``transformer``.

    The returned closure keeps a reference to ``transformer``; callers that
    want to release the model must drop the closure as well.
    """

    def denoising_loop(sigmas, video_state, audio_state, stepper):
        return euler_denoising_loop(
            sigmas=sigmas,
            video_state=video_state,
            audio_state=audio_state,
            stepper=stepper,
            denoise_fn=simple_denoising_func(
                video_context=video_context,
                audio_context=audio_context,
                transformer=transformer,
            ),
        )

    return denoising_loop


def _as_output_audio(decoded_audio):
    """Wrap the decoded input audio in an ``Audio`` with the leading dim squeezed out."""
    return Audio(
        waveform=decoded_audio.waveform.squeeze(0),
        sampling_rate=decoded_audio.sampling_rate,
    )


class AudioConditionedICLoraPipeline(ICLoraPipeline):
    """IC-LoRA pipeline with optional audio conditioning, adapted from multimodalart's audio-input Space.

    Without ``audio_path`` the call is forwarded unchanged to
    ``ICLoraPipeline.__call__``. With ``audio_path`` the audio file is encoded
    to a latent and passed as ``initial_audio_latent`` to both denoising
    stages, and the decoded input audio is returned next to the video.
    """

    def __call__(
        self,
        prompt: str,
        seed: int,
        height: int,
        width: int,
        num_frames: int,
        frame_rate: float,
        images: list[ImageConditioningInput],
        video_conditioning: list[tuple[str, float]],
        audio_path: str | None = None,
        enhance_prompt: bool = False,
        tiling_config: TilingConfig | None = None,
        conditioning_attention_strength: float = 1.0,
        skip_stage_2: bool = False,
        conditioning_attention_mask: torch.Tensor | None = None,
    ):
        """Generate a video, optionally conditioned on an audio file.

        Args:
            prompt: Text prompt to encode.
            seed: Seed for the noise generator (also used for prompt enhancement).
            height: Final output height; stage 1 runs at ``height // 2``.
            width: Final output width; stage 1 runs at ``width // 2``.
            num_frames: Number of video frames to generate.
            frame_rate: Frames per second; ``num_frames / frame_rate`` is the
                duration of audio read from ``audio_path``.
            images: Image conditioning inputs; the first one's ``path`` is
                passed to prompt enhancement when present.
            video_conditioning: Conditioning videos, forwarded to
                ``_create_conditionings``.
            audio_path: Audio file to condition on. ``None`` delegates the
                whole call to the parent pipeline.
            enhance_prompt: Whether to enhance the prompt before encoding.
            tiling_config: Tiling configuration for video decoding.
            conditioning_attention_strength: Forwarded to ``_create_conditionings``.
            skip_stage_2: If true, decode the half-resolution stage-1 latent
                and skip upsampling and the second denoising stage.
            conditioning_attention_mask: Forwarded to ``_create_conditionings``.

        Returns:
            With ``audio_path``: ``(decoded_video, original_audio)``.
            Without it: whatever the parent ``__call__`` returns.

        Raises:
            ValueError: If no audio could be decoded from ``audio_path``.
        """
        if audio_path is None:
            return super().__call__(
                prompt=prompt,
                seed=seed,
                height=height,
                width=width,
                num_frames=num_frames,
                frame_rate=frame_rate,
                images=images,
                video_conditioning=video_conditioning,
                enhance_prompt=enhance_prompt,
                tiling_config=tiling_config,
                conditioning_attention_strength=conditioning_attention_strength,
                skip_stage_2=skip_stage_2,
                conditioning_attention_mask=conditioning_attention_mask,
            )

        generator = torch.Generator(device=self.device).manual_seed(seed)
        noiser = GaussianNoiser(generator=generator)
        stepper = EulerDiffusionStep()
        dtype = torch.bfloat16

        (ctx_p,) = encode_prompts(
            [prompt],
            self.stage_1_model_ledger,
            enhance_first_prompt=enhance_prompt,
            enhance_prompt_image=images[0].path if images else None,
            enhance_prompt_seed=seed,
        )
        video_context, audio_context = ctx_p.video_encoding, ctx_p.audio_encoding

        # Only the first `video_duration` seconds of the audio file are read.
        video_duration = num_frames / frame_rate
        decoded_audio = decode_audio_from_file(audio_path, self.device, 0.0, video_duration)
        if decoded_audio is None:
            raise ValueError(f"Could not extract audio stream from {audio_path}")

        encoded_audio_latent = vae_encode_audio(decoded_audio, self.stage_1_model_ledger.audio_encoder())
        audio_shape = AudioLatentShape.from_duration(batch=1, duration=video_duration, channels=8, mel_bins=16)
        # Shorter or longer clips are padded with zeros or truncated so the
        # latent has exactly the frame count computed for the video duration.
        encoded_audio_latent = _fit_audio_latent_frames(encoded_audio_latent, audio_shape.frames)

        # ---- Stage 1: denoise at half the requested resolution ----
        stage_1_output_shape = VideoPixelShape(
            batch=1, frames=num_frames, width=width // 2, height=height // 2, fps=frame_rate
        )
        video_encoder = self.stage_1_model_ledger.video_encoder()
        stage_1_conditionings = self._create_conditionings(
            images=images,
            video_conditioning=video_conditioning,
            height=stage_1_output_shape.height,
            width=stage_1_output_shape.width,
            video_encoder=video_encoder,
            num_frames=num_frames,
            conditioning_attention_strength=conditioning_attention_strength,
            conditioning_attention_mask=conditioning_attention_mask,
        )

        transformer = self.stage_1_model_ledger.transformer()
        stage_1_sigmas = torch.Tensor(DISTILLED_SIGMA_VALUES).to(self.device)
        stage_1_loop = _make_denoising_loop(transformer, video_context, audio_context)

        video_state, _ = denoise_audio_video(
            output_shape=stage_1_output_shape,
            conditionings=stage_1_conditionings,
            noiser=noiser,
            sigmas=stage_1_sigmas,
            stepper=stepper,
            denoising_loop_fn=stage_1_loop,
            components=self.pipeline_components,
            dtype=dtype,
            device=self.device,
            initial_audio_latent=encoded_audio_latent,
        )

        torch.cuda.synchronize()
        # The loop closure holds the transformer too; drop both so the
        # stage-1 weights can actually be reclaimed.
        del transformer, stage_1_loop
        cleanup_memory()

        if skip_stage_2:
            # The encoder is not needed for decoding; release it first, as the
            # stage-2 path does, to keep peak memory down.
            del video_encoder
            cleanup_memory()
            decoded_video = vae_decode_video(
                video_state.latent, self.stage_1_model_ledger.video_decoder(), tiling_config, generator
            )
            return decoded_video, _as_output_audio(decoded_audio)

        # ---- Stage 2: upsample the stage-1 latent and refine at full resolution ----
        upscaled_video_latent = upsample_video(
            latent=video_state.latent[:1],
            video_encoder=video_encoder,
            upsampler=self.stage_2_model_ledger.spatial_upsampler(),
        )

        torch.cuda.synchronize()
        cleanup_memory()

        transformer = self.stage_2_model_ledger.transformer()
        stage_2_sigmas = torch.Tensor(STAGE_2_DISTILLED_SIGMA_VALUES).to(self.device)
        stage_2_loop = _make_denoising_loop(transformer, video_context, audio_context)

        stage_2_output_shape = VideoPixelShape(
            batch=1, frames=num_frames, width=width, height=height, fps=frame_rate
        )
        stage_2_conditionings = self._create_conditionings(
            images=images,
            video_conditioning=video_conditioning,
            height=stage_2_output_shape.height,
            width=stage_2_output_shape.width,
            video_encoder=video_encoder,
            num_frames=num_frames,
            conditioning_attention_strength=conditioning_attention_strength,
            conditioning_attention_mask=conditioning_attention_mask,
        )

        video_state, _ = denoise_audio_video(
            output_shape=stage_2_output_shape,
            conditionings=stage_2_conditionings,
            noiser=noiser,
            sigmas=stage_2_sigmas,
            stepper=stepper,
            denoising_loop_fn=stage_2_loop,
            components=self.pipeline_components,
            dtype=dtype,
            device=self.device,
            noise_scale=stage_2_sigmas[0],
            initial_video_latent=upscaled_video_latent,
            initial_audio_latent=encoded_audio_latent,
        )

        torch.cuda.synchronize()
        del transformer, stage_2_loop
        del video_encoder
        cleanup_memory()

        decoded_video = vae_decode_video(
            video_state.latent, self.stage_2_model_ledger.video_decoder(), tiling_config, generator
        )
        # The returned audio is the decoded input, not a generated track.
        return decoded_video, _as_output_audio(decoded_audio)
314
+
315
+
316
def build_pipeline(lora_name: str) -> AudioConditionedICLoraPipeline:
    """Construct the audio-capable IC-LoRA pipeline from the downloaded model files.

    ``lora_name`` is accepted for interface compatibility but is not read
    here: the pipeline is created with an empty ``loras`` list and FP8-cast
    quantization.
    """
    return AudioConditionedICLoraPipeline(
        distilled_checkpoint_path=checkpoint_path,
        spatial_upsampler_path=spatial_upsampler_path,
        gemma_root=gemma_root,
        loras=[],
        quantization=QuantizationPolicy.fp8_cast(),
    )
326
 
 
445
  def generate_video(
446
  input_image,
447
  conditioning_video,
448
+ input_audio,
449
  prompt: str,
450
  duration: float,
451
  ic_lora_choice: str,
 
479
 
480
  print(f"Generating: {height}x{width}, {num_frames} frames ({duration}s), seed={current_seed}")
481
  print(f"IC-LoRA: {ic_lora_choice}, conditioning_strength: {conditioning_strength}")
482
+ if input_audio is not None:
483
+ print(f"Audio conditioning: {input_audio}")
484
 
485
  output_dir = Path("outputs")
486
  output_dir.mkdir(exist_ok=True)
 
516
  frame_rate=frame_rate,
517
  images=images,
518
  video_conditioning=video_conditioning,
519
+ audio_path=input_audio,
520
  tiling_config=tiling_config,
521
  enhance_prompt=enhance_prompt,
522
  conditioning_attention_strength=1.0,
 
547
 
548
 
549
  with gr.Blocks(title="LTX-2.3 IC-LoRA") as demo:
550
+ gr.Markdown("# LTX-2.3 IC-LoRA: Video-to-Video + Audio Conditioning")
551
  gr.Markdown(
552
+ "Video-to-video transformations using IC-LoRA conditioning with optional audio-driven generation. "
553
+ "Upload a **conditioning video** as the IC-LoRA reference signal, optionally add an **input audio** file "
554
+ "to preserve soundtrack or lip-sync timing, optionally provide an input image for I2V, and describe the desired output. "
 
555
  "[[model]](https://huggingface.co/Lightricks/LTX-2.3) "
556
  "[[code]](https://github.com/Lightricks/LTX-2)"
557
  )
 
563
  sources=["upload"],
564
  )
565
  input_image = gr.Image(label="Input Image (Optional)", type="pil")
566
+ input_audio = gr.Audio(label="Input Audio (Optional)", type="filepath", sources=["upload"])
567
  prompt = gr.Textbox(
568
  label="Prompt",
569
  info="Describe the desired output — the IC-LoRA controls structure from the reference",
 
617
  fn=generate_video,
618
  inputs=[
619
  input_image, conditioning_video,
620
+ input_audio, prompt, duration, ic_lora_choice, conditioning_strength,
621
  enhance_prompt, skip_stage_2,
622
  seed, randomize_seed, height, width,
623
  ],