LTX-2-3-sync

Paused

App Files Files Community

linoyts HF Staff committed on Mar 9

Commit

061c241

1 Parent(s): 979ef85

Update app.py (#1)

Browse files

- Update app.py (1fc59c70a8f8f706dcb741b2777c73234eefbd75)

Files changed (1) hide show

app.py +218 -42

app.py CHANGED Viewed

@@ -24,19 +24,10 @@ subprocess.run(
 )
 print("Installing ltx-core and ltx-pipelines from cloned repo...")
-# subprocess.run(
-#     [sys.executable, "-m", "pip", "install", "--force-reinstall", "--no-deps", "-e",
-#      os.path.join(LTX_REPO_DIR, "packages", "ltx-core"),
-#      "-e", os.path.join(LTX_REPO_DIR, "packages", "ltx-pipelines")],
-#     check=True,
-# )
 subprocess.run(
-    [
-        sys.executable, "-m", "pip", "install",
-        "--force-reinstall",
-        "-e", os.path.join(LTX_REPO_DIR, "packages", "ltx-core") + "[fp8-trtllm]",
-        "-e", os.path.join(LTX_REPO_DIR, "packages", "ltx-pipelines"),
-    ],
     check=True,
 )
@@ -57,13 +48,19 @@ import gradio as gr
 import numpy as np
 from huggingface_hub import hf_hub_download, snapshot_download
-from ltx_core.loader import LoraPathStrengthAndSDOps
-from ltx_core.loader.sd_ops import LTXV_LORA_COMFY_RENAMING_MAP
 from ltx_core.model.video_vae import TilingConfig, get_video_chunks_number
 from ltx_core.quantization import QuantizationPolicy
 from ltx_pipelines.ic_lora import ICLoraPipeline
 from ltx_pipelines.utils.args import ImageConditioningInput
-from ltx_pipelines.utils.media_io import encode_video
 # Force-patch xformers attention into the LTX attention module.
 from ltx_core.model.transformer import attention as _attn_mod
@@ -113,42 +110,217 @@ checkpoint_path = hf_hub_download(repo_id="linoyts/ltx-2.3-22b-distilled-motion-
 spatial_upsampler_path = hf_hub_download(repo_id=LTX_MODEL_REPO, filename="ltx-2.3-spatial-upscaler-x2-1.0.safetensors")
 gemma_root = snapshot_download(repo_id=GEMMA_REPO)
-# Pre-download all IC-LoRA checkpoints
-ic_lora_paths = {}
-for name, info in IC_LORA_OPTIONS.items():
-    path = hf_hub_download(repo_id=info["repo"], filename=info["filename"])
-    ic_lora_paths[name] = path
-    print(f"IC-LoRA '{name}': {path}")
 print(f"Checkpoint: {checkpoint_path}")
 print(f"Spatial upsampler: {spatial_upsampler_path}")
 print(f"Gemma root: {gemma_root}")
 # Build initial pipeline with the first IC-LoRA
 default_lora_name = "Union Control (Depth + Canny)"
-default_lora_path = ic_lora_paths[default_lora_name]
 current_pipeline = None
 current_lora_name = None
-def build_pipeline(lora_name: str) -> ICLoraPipeline:
-    """Build an ICLoraPipeline with the given IC-LoRA."""
-    lora_path = ic_lora_paths[lora_name]
-    lora = LoraPathStrengthAndSDOps(
-        path=lora_path,
-        strength=1.0,
-        sd_ops=LTXV_LORA_COMFY_RENAMING_MAP,
-    )
-    pipe = ICLoraPipeline(
         distilled_checkpoint_path=checkpoint_path,
         spatial_upsampler_path=spatial_upsampler_path,
         gemma_root=gemma_root,
-        # loras=[lora],
         loras=[],
         quantization=QuantizationPolicy.fp8_cast(),
-        #quantization=QuantizationPolicy.fp8_scaled_mm()
     )
     return pipe
@@ -273,6 +445,7 @@ def on_highres_toggle(input_image, high_res):
 def generate_video(
     input_image,
     conditioning_video,
     prompt: str,
     duration: float,
     ic_lora_choice: str,
@@ -306,6 +479,8 @@ def generate_video(
         print(f"Generating: {height}x{width}, {num_frames} frames ({duration}s), seed={current_seed}")
         print(f"IC-LoRA: {ic_lora_choice}, conditioning_strength: {conditioning_strength}")
         output_dir = Path("outputs")
         output_dir.mkdir(exist_ok=True)
@@ -341,6 +516,7 @@ def generate_video(
             frame_rate=frame_rate,
             images=images,
             video_conditioning=video_conditioning,
             tiling_config=tiling_config,
             enhance_prompt=enhance_prompt,
             conditioning_attention_strength=1.0,
@@ -371,12 +547,11 @@ def generate_video(
 with gr.Blocks(title="LTX-2.3 IC-LoRA") as demo:
-    gr.Markdown("# LTX-2.3 IC-LoRA: Video-to-Video Control")
     gr.Markdown(
-        "Video-to-video transformations using IC-LoRA conditioning "
-        "(depth + canny union control, motion tracking). Upload a **conditioning video** "
-        "as the IC-LoRA reference signal, optionally provide an input image for I2V, "
-        "and describe the desired output. "
         "[[model]](https://huggingface.co/Lightricks/LTX-2.3) "
         "[[code]](https://github.com/Lightricks/LTX-2)"
     )
@@ -388,6 +563,7 @@ with gr.Blocks(title="LTX-2.3 IC-LoRA") as demo:
                 sources=["upload"],
             )
             input_image = gr.Image(label="Input Image (Optional)", type="pil")
             prompt = gr.Textbox(
                 label="Prompt",
                 info="Describe the desired output — the IC-LoRA controls structure from the reference",
@@ -441,7 +617,7 @@ with gr.Blocks(title="LTX-2.3 IC-LoRA") as demo:
         fn=generate_video,
         inputs=[
             input_image, conditioning_video,
-            prompt, duration, ic_lora_choice, conditioning_strength,
             enhance_prompt, skip_stage_2,
             seed, randomize_seed, height, width,
         ],

 )
 print("Installing ltx-core and ltx-pipelines from cloned repo...")
 subprocess.run(
+    [sys.executable, "-m", "pip", "install", "--force-reinstall", "--no-deps", "-e",
+     os.path.join(LTX_REPO_DIR, "packages", "ltx-core"),
+     "-e", os.path.join(LTX_REPO_DIR, "packages", "ltx-pipelines")],
     check=True,
 )
 import numpy as np
 from huggingface_hub import hf_hub_download, snapshot_download
 from ltx_core.model.video_vae import TilingConfig, get_video_chunks_number
 from ltx_core.quantization import QuantizationPolicy
+from ltx_core.components.diffusion_steps import EulerDiffusionStep
+from ltx_core.components.noisers import GaussianNoiser
+from ltx_core.model.audio_vae import encode_audio as vae_encode_audio
+from ltx_core.model.upsampler import upsample_video
+from ltx_core.model.video_vae import decode_video as vae_decode_video
+from ltx_core.types import Audio, AudioLatentShape, VideoPixelShape
 from ltx_pipelines.ic_lora import ICLoraPipeline
+from ltx_pipelines.utils import cleanup_memory, denoise_audio_video, encode_prompts, euler_denoising_loop, simple_denoising_func
 from ltx_pipelines.utils.args import ImageConditioningInput
+from ltx_pipelines.utils.constants import DISTILLED_SIGMA_VALUES, STAGE_2_DISTILLED_SIGMA_VALUES
+from ltx_pipelines.utils.media_io import decode_audio_from_file, encode_video
 # Force-patch xformers attention into the LTX attention module.
 from ltx_core.model.transformer import attention as _attn_mod
 spatial_upsampler_path = hf_hub_download(repo_id=LTX_MODEL_REPO, filename="ltx-2.3-spatial-upscaler-x2-1.0.safetensors")
 gemma_root = snapshot_download(repo_id=GEMMA_REPO)
 print(f"Checkpoint: {checkpoint_path}")
 print(f"Spatial upsampler: {spatial_upsampler_path}")
 print(f"Gemma root: {gemma_root}")
 # Build initial pipeline with the first IC-LoRA
 default_lora_name = "Union Control (Depth + Canny)"
 current_pipeline = None
 current_lora_name = None
+class AudioConditionedICLoraPipeline(ICLoraPipeline):  # subclass that overrides __call__ to accept an optional audio_path
+    """IC-LoRA pipeline with optional audio conditioning, adapted from multimodalart's audio-input Space."""
+    def __call__(  # same keyword set that is forwarded to super().__call__ below, plus audio_path
+        self,
+        prompt: str,
+        seed: int,
+        height: int,
+        width: int,
+        num_frames: int,
+        frame_rate: float,
+        images: list[ImageConditioningInput],
+        video_conditioning: list[tuple[str, float]],
+        audio_path: str | None = None,  # None selects the unmodified parent behavior (first branch below)
+        enhance_prompt: bool = False,
+        tiling_config: TilingConfig | None = None,
+        conditioning_attention_strength: float = 1.0,
+        skip_stage_2: bool = False,
+        conditioning_attention_mask: torch.Tensor | None = None,
+    ):
+        if audio_path is None:  # no audio given: delegate to ICLoraPipeline.__call__ with every other argument
+            return super().__call__(
+                prompt=prompt,
+                seed=seed,
+                height=height,
+                width=width,
+                num_frames=num_frames,
+                frame_rate=frame_rate,
+                images=images,
+                video_conditioning=video_conditioning,
+                enhance_prompt=enhance_prompt,
+                tiling_config=tiling_config,
+                conditioning_attention_strength=conditioning_attention_strength,
+                skip_stage_2=skip_stage_2,
+                conditioning_attention_mask=conditioning_attention_mask,
+            )
+        generator = torch.Generator(device=self.device).manual_seed(seed)  # seeded on self.device; reused by the noiser and both vae_decode_video calls
+        noiser = GaussianNoiser(generator=generator)
+        stepper = EulerDiffusionStep()
+        dtype = torch.bfloat16  # passed to both denoise_audio_video calls; NOTE(review): the `torch` import is not visible in this view
+        (ctx_p,) = encode_prompts(  # one prompt in, so exactly one context is unpacked
+            [prompt],
+            self.stage_1_model_ledger,
+            enhance_first_prompt=enhance_prompt,
+            enhance_prompt_image=images[0].path if len(images) > 0 else None,  # path of the first conditioning image, or None when images is empty
+            enhance_prompt_seed=seed,
+        )
+        video_context, audio_context = ctx_p.video_encoding, ctx_p.audio_encoding
+        video_duration = num_frames / frame_rate  # presumably seconds (frame_rate is passed as fps below) — confirm
+        decoded_audio = decode_audio_from_file(audio_path, self.device, 0.0, video_duration)  # NOTE(review): 0.0 and video_duration look like start offset and max duration — confirm
+        if decoded_audio is None:
+            raise ValueError(f"Could not extract audio stream from {audio_path}")
+        encoded_audio_latent = vae_encode_audio(decoded_audio, self.stage_1_model_ledger.audio_encoder())
+        audio_shape = AudioLatentShape.from_duration(batch=1, duration=video_duration, channels=8, mel_bins=16)  # NOTE(review): channels=8 / mel_bins=16 are hard-coded; presumably they match the audio VAE latent layout — confirm
+        expected_frames = audio_shape.frames
+        actual_frames = encoded_audio_latent.shape[2]  # dim 2 is the axis trimmed or padded below
+        if actual_frames > expected_frames:  # encoded audio longer than expected: truncate along dim 2
+            encoded_audio_latent = encoded_audio_latent[:, :, :expected_frames, :]
+        elif actual_frames < expected_frames:  # encoded audio shorter than expected: zero-pad along dim 2
+            pad = torch.zeros(
+                encoded_audio_latent.shape[0],
+                encoded_audio_latent.shape[1],
+                expected_frames - actual_frames,
+                encoded_audio_latent.shape[3],
+                device=encoded_audio_latent.device,
+                dtype=encoded_audio_latent.dtype,
+            )
+            encoded_audio_latent = torch.cat([encoded_audio_latent, pad], dim=2)
+        stage_1_output_shape = VideoPixelShape(batch=1, frames=num_frames, width=width // 2, height=height // 2, fps=frame_rate)  # stage 1 runs at half the requested width and height
+        video_encoder = self.stage_1_model_ledger.video_encoder()
+        stage_1_conditionings = self._create_conditionings(
+            images=images,
+            video_conditioning=video_conditioning,
+            height=stage_1_output_shape.height,
+            width=stage_1_output_shape.width,
+            video_encoder=video_encoder,
+            num_frames=num_frames,
+            conditioning_attention_strength=conditioning_attention_strength,
+            conditioning_attention_mask=conditioning_attention_mask,
+        )
+        transformer = self.stage_1_model_ledger.transformer()
+        stage_1_sigmas = torch.Tensor(DISTILLED_SIGMA_VALUES).to(self.device)
+        def first_stage_denoising_loop(sigmas, video_state, audio_state, stepper):  # closure over video_context, audio_context and the current `transformer` binding
+            return euler_denoising_loop(
+                sigmas=sigmas,
+                video_state=video_state,
+                audio_state=audio_state,
+                stepper=stepper,
+                denoise_fn=simple_denoising_func(
+                    video_context=video_context,
+                    audio_context=audio_context,
+                    transformer=transformer,
+                ),
+            )
+        video_state, audio_state = denoise_audio_video(
+            output_shape=stage_1_output_shape,
+            conditionings=stage_1_conditionings,
+            noiser=noiser,
+            sigmas=stage_1_sigmas,
+            stepper=stepper,
+            denoising_loop_fn=first_stage_denoising_loop,
+            components=self.pipeline_components,
+            dtype=dtype,
+            device=self.device,
+            initial_audio_latent=encoded_audio_latent,  # encoded input audio handed in as the initial audio latent
+        )
+        torch.cuda.synchronize()
+        del transformer  # drop the local stage-1 transformer reference before cleanup_memory()
+        cleanup_memory()
+        if skip_stage_2:  # early exit: decode the half-resolution stage-1 latent and return
+            decoded_video = vae_decode_video(
+                video_state.latent, self.stage_1_model_ledger.video_decoder(), tiling_config, generator
+            )
+            original_audio = Audio(waveform=decoded_audio.waveform.squeeze(0), sampling_rate=decoded_audio.sampling_rate)  # returned audio is rebuilt from the decoded input file, not from audio_state
+            del video_encoder
+            cleanup_memory()
+            return decoded_video, original_audio
+        upscaled_video_latent = upsample_video(
+            latent=video_state.latent[:1],  # first batch entry only
+            video_encoder=video_encoder,
+            upsampler=self.stage_2_model_ledger.spatial_upsampler(),
+        )
+        torch.cuda.synchronize()
+        cleanup_memory()
+        transformer = self.stage_2_model_ledger.transformer()  # stage-2 transformer; the closure defined next reads this binding
+        stage_2_sigmas = torch.Tensor(STAGE_2_DISTILLED_SIGMA_VALUES).to(self.device)
+        def second_stage_denoising_loop(sigmas, video_state, audio_state, stepper):  # same shape as the stage-1 loop, bound to the stage-2 transformer
+            return euler_denoising_loop(
+                sigmas=sigmas,
+                video_state=video_state,
+                audio_state=audio_state,
+                stepper=stepper,
+                denoise_fn=simple_denoising_func(
+                    video_context=video_context,
+                    audio_context=audio_context,
+                    transformer=transformer,
+                ),
+            )
+        stage_2_output_shape = VideoPixelShape(batch=1, frames=num_frames, width=width, height=height, fps=frame_rate)  # stage 2 runs at the full requested resolution
+        stage_2_conditionings = self._create_conditionings(
+            images=images,
+            video_conditioning=video_conditioning,
+            height=stage_2_output_shape.height,
+            width=stage_2_output_shape.width,
+            video_encoder=video_encoder,
+            num_frames=num_frames,
+            conditioning_attention_strength=conditioning_attention_strength,
+            conditioning_attention_mask=conditioning_attention_mask,
+        )
+        video_state, audio_state = denoise_audio_video(
+            output_shape=stage_2_output_shape,
+            conditionings=stage_2_conditionings,
+            noiser=noiser,
+            sigmas=stage_2_sigmas,
+            stepper=stepper,
+            denoising_loop_fn=second_stage_denoising_loop,
+            components=self.pipeline_components,
+            dtype=dtype,
+            device=self.device,
+            noise_scale=stage_2_sigmas[0],  # first stage-2 sigma is used as the noise scale
+            initial_video_latent=upscaled_video_latent,  # upsampled stage-1 latent is the starting video latent
+            initial_audio_latent=encoded_audio_latent,
+        )
+        torch.cuda.synchronize()
+        del transformer
+        del video_encoder
+        cleanup_memory()
+        decoded_video = vae_decode_video(
+            video_state.latent, self.stage_2_model_ledger.video_decoder(), tiling_config, generator
+        )
+        original_audio = Audio(waveform=decoded_audio.waveform.squeeze(0), sampling_rate=decoded_audio.sampling_rate)  # as in the skip_stage_2 branch: input audio is returned, audio_state is not decoded
+        return decoded_video, original_audio
+def build_pipeline(lora_name: str) -> AudioConditionedICLoraPipeline:  # NOTE(review): lora_name is not referenced anywhere in the visible body
+    """Build the fused IC-LoRA pipeline with optional audio conditioning."""
+    pipe = AudioConditionedICLoraPipeline(
         distilled_checkpoint_path=checkpoint_path,  # module-level path obtained from hf_hub_download
         spatial_upsampler_path=spatial_upsampler_path,  # module-level path obtained from hf_hub_download
         gemma_root=gemma_root,  # module-level directory obtained from snapshot_download
         loras=[],  # no LoRA is passed at construction
         quantization=QuantizationPolicy.fp8_cast(),
     )
     return pipe
 def generate_video(
     input_image,
     conditioning_video,
+    input_audio,
     prompt: str,
     duration: float,
     ic_lora_choice: str,
         print(f"Generating: {height}x{width}, {num_frames} frames ({duration}s), seed={current_seed}")
         print(f"IC-LoRA: {ic_lora_choice}, conditioning_strength: {conditioning_strength}")
+        if input_audio is not None:
+            print(f"Audio conditioning: {input_audio}")
         output_dir = Path("outputs")
         output_dir.mkdir(exist_ok=True)
             frame_rate=frame_rate,
             images=images,
             video_conditioning=video_conditioning,
+            audio_path=input_audio,
             tiling_config=tiling_config,
             enhance_prompt=enhance_prompt,
             conditioning_attention_strength=1.0,
 with gr.Blocks(title="LTX-2.3 IC-LoRA") as demo:
+    gr.Markdown("# LTX-2.3 IC-LoRA: Video-to-Video + Audio Conditioning")
     gr.Markdown(
+        "Video-to-video transformations using IC-LoRA conditioning with optional audio-driven generation. "
+        "Upload a **conditioning video** as the IC-LoRA reference signal, optionally add an **input audio** file "
+        "to preserve soundtrack or lip-sync timing, optionally provide an input image for I2V, and describe the desired output. "
         "[[model]](https://huggingface.co/Lightricks/LTX-2.3) "
         "[[code]](https://github.com/Lightricks/LTX-2)"
     )
                 sources=["upload"],
             )
             input_image = gr.Image(label="Input Image (Optional)", type="pil")
+            input_audio = gr.Audio(label="Input Audio (Optional)", type="filepath", sources=["upload"])
             prompt = gr.Textbox(
                 label="Prompt",
                 info="Describe the desired output — the IC-LoRA controls structure from the reference",
         fn=generate_video,
         inputs=[
             input_image, conditioning_video,
+            input_audio, prompt, duration, ic_lora_choice, conditioning_strength,
             enhance_prompt, skip_stage_2,
             seed, randomize_seed, height, width,
         ],