linoyts HF Staff commited on
Commit
95a227e
Β·
verified Β·
1 Parent(s): 21c3a50

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +566 -380
app.py CHANGED
@@ -10,18 +10,12 @@ os.environ["TORCHDYNAMO_DISABLE"] = "1"
10
  subprocess.run([sys.executable, "-m", "pip", "install", "xformers==0.0.32.post2", "--no-build-isolation"], check=False)
11
 
12
  # Clone LTX-2 repo and install packages
13
- LTX_REPO_URL = "https://github.com/linoytsaban/LTX-2.git"
14
- LTX_REPO_BRANCH = "patch-1"
15
  LTX_REPO_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "LTX-2")
16
 
17
- if os.path.exists(LTX_REPO_DIR):
18
- subprocess.run(["rm", "-rf", LTX_REPO_DIR], check=True)
19
-
20
- print(f"Cloning {LTX_REPO_URL}@{LTX_REPO_BRANCH}...")
21
- subprocess.run(
22
- ["git", "clone", "--depth", "1", "--branch", LTX_REPO_BRANCH, LTX_REPO_URL, LTX_REPO_DIR],
23
- check=True,
24
- )
25
 
26
  print("Installing ltx-core and ltx-pipelines from cloned repo...")
27
  subprocess.run(
@@ -31,7 +25,6 @@ subprocess.run(
31
  check=True,
32
  )
33
 
34
-
35
  sys.path.insert(0, os.path.join(LTX_REPO_DIR, "packages", "ltx-pipelines", "src"))
36
  sys.path.insert(0, os.path.join(LTX_REPO_DIR, "packages", "ltx-core", "src"))
37
 
@@ -48,21 +41,42 @@ import spaces
48
  import gradio as gr
49
  import numpy as np
50
  from huggingface_hub import hf_hub_download, snapshot_download
 
51
 
52
- from ltx_core.model.video_vae import TilingConfig, get_video_chunks_number
53
- from ltx_core.quantization import QuantizationPolicy
54
  from ltx_core.components.diffusion_steps import EulerDiffusionStep
55
  from ltx_core.components.noisers import GaussianNoiser
 
 
 
 
 
 
 
56
  from ltx_core.model.audio_vae import encode_audio as vae_encode_audio
57
  from ltx_core.model.upsampler import upsample_video
 
58
  from ltx_core.model.video_vae import decode_video as vae_decode_video
59
- from ltx_core.types import Audio, AudioLatentShape, VideoPixelShape
60
- from ltx_pipelines.ic_lora import ICLoraPipeline
61
- from ltx_pipelines.utils.helpers import combined_image_conditionings
62
- from ltx_pipelines.utils import cleanup_memory, denoise_audio_video, encode_prompts, euler_denoising_loop, simple_denoising_func
63
  from ltx_pipelines.utils.args import ImageConditioningInput
64
  from ltx_pipelines.utils.constants import DISTILLED_SIGMA_VALUES, STAGE_2_DISTILLED_SIGMA_VALUES
65
- from ltx_pipelines.utils.media_io import decode_audio_from_file, encode_video
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
  # Force-patch xformers attention into the LTX attention module.
68
  from ltx_core.model.transformer import attention as _attn_mod
@@ -76,55 +90,139 @@ except Exception as e:
76
 
77
  logging.getLogger().setLevel(logging.INFO)
78
 
79
- MAX_SEED = np.iinfo(np.int32).max
80
- DEFAULT_FRAME_RATE = 24.0
81
-
82
- # Resolution presets: (width, height)
83
- RESOLUTIONS = {
84
- "high": {"16:9": (1536, 1024), "9:16": (1024, 1536), "1:1": (1024, 1024)},
85
- "low": {"16:9": (768, 512), "9:16": (512, 768), "1:1": (768, 768)},
86
- }
87
 
88
- # Model repos
89
- LTX_MODEL_REPO = "diffusers-internal-dev/ltx-23"
90
- GEMMA_REPO = "google/gemma-3-12b-it-qat-q4_0-unquantized"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
 
92
- # Available IC-LoRAs for LTX-2.3 (22B)
93
- IC_LORA_OPTIONS = {
94
- "Union Control (Depth + Canny)": {
95
- "repo": "Lightricks/LTX-2.3-22b-IC-LoRA-Union-Control",
96
- "filename": "ltx-2.3-22b-ic-lora-union-control-ref0.5.safetensors",
97
- },
98
- "Motion Track Control": {
99
- "repo": "Lightricks/LTX-2.3-22b-IC-LoRA-Motion-Track-Control",
100
- "filename": "ltx-2.3-22b-ic-lora-motion-track-control-ref0.5.safetensors",
101
- },
102
- }
103
 
104
- # Download model checkpoints
105
- print("=" * 80)
106
- print("Downloading LTX-2.3 distilled model + Gemma + IC-LoRAs...")
107
- print("=" * 80)
 
 
 
 
 
 
108
 
109
- # checkpoint_path = hf_hub_download(repo_id=LTX_MODEL_REPO, filename="ltx-2.3-22b-distilled.safetensors")
110
- checkpoint_path = hf_hub_download(repo_id="linoyts/ltx-2.3-22b-distilled-motion-track-control-fused", filename="ltx-2.3-22b-distilled-motion-track-control-fused.safetensors")
 
 
 
 
 
 
 
 
 
 
 
 
111
 
112
- spatial_upsampler_path = hf_hub_download(repo_id=LTX_MODEL_REPO, filename="ltx-2.3-spatial-upscaler-x2-1.0.safetensors")
113
- gemma_root = snapshot_download(repo_id=GEMMA_REPO)
 
 
114
 
115
- print(f"Checkpoint: {checkpoint_path}")
116
- print(f"Spatial upsampler: {spatial_upsampler_path}")
117
- print(f"Gemma root: {gemma_root}")
 
 
 
 
 
 
 
 
118
 
119
- # Build initial pipeline with the first IC-LoRA
120
- default_lora_name = "Union Control (Depth + Canny)"
121
- current_pipeline = None
122
- current_lora_name = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
 
 
 
 
 
 
 
 
 
 
 
124
 
125
- class AudioConditionedICLoraPipeline(ICLoraPipeline):
126
- """IC-LoRA pipeline with optional audio conditioning, adapted from multimodalart's audio-input Space."""
 
127
 
 
128
  def __call__(
129
  self,
130
  prompt: str,
@@ -134,85 +232,111 @@ class AudioConditionedICLoraPipeline(ICLoraPipeline):
134
  num_frames: int,
135
  frame_rate: float,
136
  images: list[ImageConditioningInput],
137
- video_conditioning: list[tuple[str, float]],
138
  audio_path: str | None = None,
139
- enhance_prompt: bool = False,
140
  tiling_config: TilingConfig | None = None,
141
- conditioning_attention_strength: float = 1.0,
142
- skip_stage_2: bool = False,
143
- conditioning_attention_mask: torch.Tensor | None = None,
144
  ):
145
- if audio_path is None:
146
- return super().__call__(
147
- prompt=prompt,
148
- seed=seed,
149
- height=height,
150
- width=width,
151
- num_frames=num_frames,
152
- frame_rate=frame_rate,
153
- images=images,
154
- video_conditioning=video_conditioning,
155
- enhance_prompt=enhance_prompt,
156
- tiling_config=tiling_config,
157
- conditioning_attention_strength=conditioning_attention_strength,
158
- skip_stage_2=skip_stage_2,
159
- conditioning_attention_mask=conditioning_attention_mask,
160
- )
161
 
162
  generator = torch.Generator(device=self.device).manual_seed(seed)
163
  noiser = GaussianNoiser(generator=generator)
164
  stepper = EulerDiffusionStep()
165
  dtype = torch.bfloat16
166
 
 
 
167
  (ctx_p,) = encode_prompts(
168
  [prompt],
169
  self.stage_1_model_ledger,
170
  enhance_first_prompt=enhance_prompt,
171
  enhance_prompt_image=images[0].path if len(images) > 0 else None,
172
- enhance_prompt_seed=seed,
173
  )
174
  video_context, audio_context = ctx_p.video_encoding, ctx_p.audio_encoding
175
 
176
- video_duration = num_frames / frame_rate
177
- decoded_audio = decode_audio_from_file(audio_path, self.device, 0.0, video_duration)
178
- if decoded_audio is None:
179
- raise ValueError(f"Could not extract audio stream from {audio_path}")
180
-
181
- encoded_audio_latent = vae_encode_audio(decoded_audio, self.stage_1_model_ledger.audio_encoder())
182
- audio_shape = AudioLatentShape.from_duration(batch=1, duration=video_duration, channels=8, mel_bins=16)
183
- expected_frames = audio_shape.frames
184
- actual_frames = encoded_audio_latent.shape[2]
185
-
186
- if actual_frames > expected_frames:
187
- encoded_audio_latent = encoded_audio_latent[:, :, :expected_frames, :]
188
- elif actual_frames < expected_frames:
189
- pad = torch.zeros(
190
- encoded_audio_latent.shape[0],
191
- encoded_audio_latent.shape[1],
192
- expected_frames - actual_frames,
193
- encoded_audio_latent.shape[3],
194
- device=encoded_audio_latent.device,
195
- dtype=encoded_audio_latent.dtype,
 
 
 
 
 
 
 
 
 
 
 
196
  )
197
- encoded_audio_latent = torch.cat([encoded_audio_latent, pad], dim=2)
198
 
199
- stage_1_output_shape = VideoPixelShape(batch=1, frames=num_frames, width=width // 2, height=height // 2, fps=frame_rate)
 
200
  video_encoder = self.stage_1_model_ledger.video_encoder()
201
- stage_1_conditionings = self._create_conditionings(
 
 
 
 
 
 
 
202
  images=images,
203
- video_conditioning=video_conditioning,
204
  height=stage_1_output_shape.height,
205
  width=stage_1_output_shape.width,
206
  video_encoder=video_encoder,
207
- num_frames=num_frames,
208
- conditioning_attention_strength=conditioning_attention_strength,
209
- conditioning_attention_mask=conditioning_attention_mask,
210
  )
211
 
 
 
 
 
 
 
 
 
 
 
 
 
 
212
  transformer = self.stage_1_model_ledger.transformer()
213
  stage_1_sigmas = torch.Tensor(DISTILLED_SIGMA_VALUES).to(self.device)
214
 
215
- def first_stage_denoising_loop(sigmas, video_state, audio_state, stepper):
216
  return euler_denoising_loop(
217
  sigmas=sigmas,
218
  video_state=video_state,
@@ -225,32 +349,39 @@ class AudioConditionedICLoraPipeline(ICLoraPipeline):
225
  ),
226
  )
227
 
228
- video_state, audio_state = denoise_audio_video(
229
- output_shape=stage_1_output_shape,
230
- conditionings=stage_1_conditionings,
231
- noiser=noiser,
232
- sigmas=stage_1_sigmas,
233
- stepper=stepper,
234
- denoising_loop_fn=first_stage_denoising_loop,
235
- components=self.pipeline_components,
236
- dtype=dtype,
237
- device=self.device,
238
- initial_audio_latent=encoded_audio_latent,
239
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
 
241
  torch.cuda.synchronize()
242
- del transformer
243
  cleanup_memory()
244
 
245
- if skip_stage_2:
246
- decoded_video = vae_decode_video(
247
- video_state.latent, self.stage_1_model_ledger.video_decoder(), tiling_config, generator
248
- )
249
- original_audio = Audio(waveform=decoded_audio.waveform.squeeze(0), sampling_rate=decoded_audio.sampling_rate)
250
- del video_encoder
251
- cleanup_memory()
252
- return decoded_video, original_audio
253
-
254
  upscaled_video_latent = upsample_video(
255
  latent=video_state.latent[:1],
256
  video_encoder=video_encoder,
@@ -260,10 +391,11 @@ class AudioConditionedICLoraPipeline(ICLoraPipeline):
260
  torch.cuda.synchronize()
261
  cleanup_memory()
262
 
263
- transformer = self.stage_2_model_ledger.transformer()
 
264
  stage_2_sigmas = torch.Tensor(STAGE_2_DISTILLED_SIGMA_VALUES).to(self.device)
265
 
266
- def second_stage_denoising_loop(sigmas, video_state, audio_state, stepper):
267
  return euler_denoising_loop(
268
  sigmas=sigmas,
269
  video_state=video_state,
@@ -272,275 +404,334 @@ class AudioConditionedICLoraPipeline(ICLoraPipeline):
272
  denoise_fn=simple_denoising_func(
273
  video_context=video_context,
274
  audio_context=audio_context,
275
- transformer=transformer,
276
  ),
277
  )
278
 
279
- stage_2_output_shape = VideoPixelShape(batch=1, frames=num_frames, width=width, height=height, fps=frame_rate)
 
 
 
280
  stage_2_conditionings = combined_image_conditionings(
281
  images=images,
282
- #video_conditioning=video_conditioning,
283
  height=stage_2_output_shape.height,
284
  width=stage_2_output_shape.width,
285
  video_encoder=video_encoder,
286
  dtype=dtype,
287
  device=self.device,
288
- # num_frames=num_frames,
289
- # conditioning_attention_strength=conditioning_attention_strength,
290
- # conditioning_attention_mask=conditioning_attention_mask,
291
  )
292
 
293
- video_state, audio_state = denoise_audio_video(
294
- output_shape=stage_2_output_shape,
295
- conditionings=stage_2_conditionings,
296
- noiser=noiser,
297
- sigmas=stage_2_sigmas,
298
- stepper=stepper,
299
- denoising_loop_fn=second_stage_denoising_loop,
300
- components=self.pipeline_components,
301
- dtype=dtype,
302
- device=self.device,
303
- noise_scale=stage_2_sigmas[0],
304
- initial_video_latent=upscaled_video_latent,
305
- initial_audio_latent=encoded_audio_latent,
306
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
307
 
308
  torch.cuda.synchronize()
309
- del transformer
310
- del video_encoder
311
  cleanup_memory()
312
 
 
313
  decoded_video = vae_decode_video(
314
- video_state.latent, self.stage_2_model_ledger.video_decoder(), tiling_config, generator
 
 
 
315
  )
316
- original_audio = Audio(waveform=decoded_audio.waveform.squeeze(0), sampling_rate=decoded_audio.sampling_rate)
317
- return decoded_video, original_audio
318
-
319
-
320
- def build_pipeline(lora_name: str) -> AudioConditionedICLoraPipeline:
321
- """Build the fused IC-LoRA pipeline with optional audio conditioning."""
322
- pipe = AudioConditionedICLoraPipeline(
323
- distilled_checkpoint_path=checkpoint_path,
324
- spatial_upsampler_path=spatial_upsampler_path,
325
- gemma_root=gemma_root,
326
- loras=[],
327
- quantization=QuantizationPolicy.fp8_cast(),
328
- )
329
- return pipe
330
-
331
-
332
- def preload_pipeline(pipe: ICLoraPipeline) -> None:
333
- """Preload all models from both ledgers for ZeroGPU tensor packing."""
334
- print("Preloading stage 1 models (with IC-LoRA)...")
335
- import gc
336
-
337
- def cleanup_vram():
338
- gc.collect()
339
- if torch.cuda.is_available():
340
- torch.cuda.empty_cache()
341
-
342
- # Preload all models for ZeroGPU tensor packing.
343
- print("Preloading stage 1 models (with IC-LoRA)...")
344
- s1 = pipe.stage_1_model_ledger
345
-
346
- _s1_text_encoder = s1.text_encoder()
347
- _s1_embeddings_processor = s1.gemma_embeddings_processor()
348
- _s1_video_encoder = s1.video_encoder()
349
- cleanup_vram()
350
-
351
- _s1_transformer = s1.transformer()
352
- cleanup_vram()
353
-
354
- s1.transformer = lambda: _s1_transformer
355
- s1.video_encoder = lambda: _s1_video_encoder
356
- s1.text_encoder = lambda: _s1_text_encoder
357
- s1.gemma_embeddings_processor = lambda: _s1_embeddings_processor
358
-
359
- # Free stage 1 builders β€” we've cached the built models via lambdas
360
- if hasattr(s1, 'transformer_builder'): del s1.transformer_builder
361
- if hasattr(s1, 'vae_encoder_builder'): del s1.vae_encoder_builder
362
- if hasattr(s1, 'text_encoder_builder'): del s1.text_encoder_builder
363
- if hasattr(s1, 'embeddings_processor_builder'): del s1.embeddings_processor_builder
364
- cleanup_vram()
365
-
366
- print("Preloading stage 2 models (without IC-LoRA)...")
367
- s2 = pipe.stage_2_model_ledger
368
-
369
- _s2_video_encoder = s2.video_encoder()
370
- _s2_video_decoder = s2.video_decoder()
371
- _s2_audio_decoder = s2.audio_decoder()
372
- _s2_vocoder = s2.vocoder()
373
- _s2_spatial_upsampler = s2.spatial_upsampler()
374
- cleanup_vram()
375
-
376
- _s2_transformer = s2.transformer()
377
- cleanup_vram()
378
-
379
- s2.transformer = lambda: _s2_transformer
380
- s2.video_encoder = lambda: _s2_video_encoder
381
- s2.video_decoder = lambda: _s2_video_decoder
382
- s2.audio_decoder = lambda: _s2_audio_decoder
383
- s2.vocoder = lambda: _s2_vocoder
384
- s2.spatial_upsampler = lambda: _s2_spatial_upsampler
385
-
386
- # Free stage 2 builders
387
- if hasattr(s2, 'transformer_builder'): del s2.transformer_builder
388
- if hasattr(s2, 'vae_encoder_builder'): del s2.vae_encoder_builder
389
- if hasattr(s2, 'vae_decoder_builder'): del s2.vae_decoder_builder
390
- if hasattr(s2, 'audio_decoder_builder'): del s2.audio_decoder_builder
391
- if hasattr(s2, 'vocoder_builder'): del s2.vocoder_builder
392
- if hasattr(s2, 'upsampler_builder'): del s2.upsampler_builder
393
- cleanup_vram()
394
-
395
- print("All models preloaded!")
396
-
397
-
398
- print(f"Building initial pipeline with IC-LoRA: {default_lora_name}")
399
- current_pipeline = build_pipeline(default_lora_name)
400
- current_lora_name = default_lora_name
401
- preload_pipeline(current_pipeline)
402
 
403
  print("=" * 80)
404
- print("Pipeline ready!")
405
  print("=" * 80)
406
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
407
 
408
- def log_memory(tag: str):
409
- if torch.cuda.is_available():
410
- allocated = torch.cuda.memory_allocated() / 1024**3
411
- peak = torch.cuda.max_memory_allocated() / 1024**3
412
- free, total = torch.cuda.mem_get_info()
413
- print(f"[VRAM {tag}] allocated={allocated:.2f}GB peak={peak:.2f}GB free={free / 1024**3:.2f}GB total={total / 1024**3:.2f}GB")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
414
 
415
 
416
- def detect_aspect_ratio(media) -> str:
 
 
 
417
  """Detect the closest aspect ratio from an image or video."""
418
- if media is None:
419
  return "16:9"
420
- if hasattr(media, "size"):
421
- w, h = media.size
422
- elif hasattr(media, "shape"):
423
- h, w = media.shape[:2]
 
 
 
 
 
 
 
424
  else:
425
- return "16:9"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
426
  ratio = w / h
427
  candidates = {"16:9": 16 / 9, "9:16": 9 / 16, "1:1": 1.0}
428
  return min(candidates, key=lambda k: abs(ratio - candidates[k]))
429
 
430
 
431
- def on_media_upload(input_image, high_res):
432
  """Auto-set resolution when image is uploaded."""
433
- aspect = detect_aspect_ratio(input_image)
 
 
 
 
 
 
 
 
 
 
434
  tier = "high" if high_res else "low"
435
  w, h = RESOLUTIONS[tier][aspect]
436
  return gr.update(value=w), gr.update(value=h)
437
 
438
 
439
- def on_highres_toggle(input_image, high_res):
440
  """Update resolution when high-res toggle changes."""
441
- aspect = detect_aspect_ratio(input_image)
 
442
  tier = "high" if high_res else "low"
443
  w, h = RESOLUTIONS[tier][aspect]
444
  return gr.update(value=w), gr.update(value=h)
445
 
446
 
447
- def extract_audio_from_video(video_path: str, output_dir: Path, seed: int) -> str:
448
- """Extract the audio track from a video into a wav file for audio conditioning."""
449
- audio_output = output_dir / f"extracted_audio_{seed}.wav"
450
- cmd = [
451
- "ffmpeg", "-y",
452
- "-i", str(video_path),
453
- "-vn",
454
- "-acodec", "pcm_s16le",
455
- "-ar", "48000",
456
- "-ac", "2",
457
- str(audio_output),
458
- ]
459
- result = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
460
- if result.returncode != 0 or not audio_output.exists() or audio_output.stat().st_size == 0:
461
- raise gr.Error("Could not extract audio from the conditioning video. Make sure the uploaded video contains an audio track.")
462
- return str(audio_output)
463
-
464
-
465
- @spaces.GPU(duration=120, size="xlarge")
466
  @torch.inference_mode()
467
  def generate_video(
468
  input_image,
469
- conditioning_video,
470
  input_audio,
471
- use_video_audio,
472
  prompt: str,
473
  duration: float,
474
- ic_lora_choice: str,
475
  conditioning_strength: float,
476
  enhance_prompt: bool,
477
- skip_stage_2: bool,
478
  seed: int,
479
  randomize_seed: bool,
480
  height: int,
481
  width: int,
482
  progress=gr.Progress(track_tqdm=True),
483
  ):
484
- global current_pipeline, current_lora_name
485
-
486
  try:
487
  torch.cuda.reset_peak_memory_stats()
488
- log_memory("start")
489
-
490
- # Rebuild pipeline if IC-LoRA changed
491
- if ic_lora_choice != current_lora_name:
492
- print(f"Switching IC-LoRA: {current_lora_name} β†’ {ic_lora_choice}")
493
- current_pipeline = build_pipeline(ic_lora_choice)
494
- current_lora_name = ic_lora_choice
495
- preload_pipeline(current_pipeline)
496
-
497
  current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
498
 
499
  frame_rate = DEFAULT_FRAME_RATE
500
  num_frames = int(duration * frame_rate) + 1
501
  num_frames = ((num_frames - 1 + 7) // 8) * 8 + 1
502
 
503
- print(f"Generating: {height}x{width}, {num_frames} frames ({duration}s), seed={current_seed}")
504
- print(f"IC-LoRA: {ic_lora_choice}, conditioning_strength: {conditioning_strength}")
 
 
 
505
  if input_audio is not None:
506
- print(f"Audio conditioning: {input_audio}")
 
 
 
507
 
508
- output_dir = Path("outputs")
509
- output_dir.mkdir(exist_ok=True)
510
 
511
- # Image conditioning (optional, for I2V)
512
  images = []
513
  if input_image is not None:
514
- temp_image_path = output_dir / f"temp_input_{current_seed}.jpg"
515
- if hasattr(input_image, "save"):
516
- input_image.save(temp_image_path)
517
- else:
518
- temp_image_path = Path(input_image)
519
- images.append(ImageConditioningInput(path=str(temp_image_path), frame_idx=0, strength=1.0))
520
-
521
- # Video conditioning for IC-LoRA (reference video)
522
- video_conditioning = []
523
- video_path = None
524
- if conditioning_video is not None:
525
- video_path = str(conditioning_video)
526
- video_conditioning.append((video_path, conditioning_strength))
527
- print(f"Video conditioning: {video_path} (strength={conditioning_strength})")
528
-
529
- audio_path = input_audio
530
- if use_video_audio:
531
- if video_path is None:
532
- raise gr.Error("Enable 'Use audio from conditioning video' only when a conditioning video is uploaded.")
533
- audio_path = extract_audio_from_video(video_path, output_dir, current_seed)
534
- print(f"Extracted audio from conditioning video: {audio_path}")
535
- elif audio_path is not None:
536
- print(f"Using uploaded audio file: {audio_path}")
537
 
538
  tiling_config = TilingConfig.default()
539
  video_chunks_number = get_video_chunks_number(num_frames, tiling_config)
540
 
541
- log_memory("before pipeline call")
542
-
543
- video, audio = current_pipeline(
544
  prompt=prompt,
545
  seed=current_seed,
546
  height=int(height),
@@ -548,16 +739,13 @@ def generate_video(
548
  num_frames=num_frames,
549
  frame_rate=frame_rate,
550
  images=images,
 
551
  video_conditioning=video_conditioning,
552
- audio_path=audio_path,
553
  tiling_config=tiling_config,
554
  enhance_prompt=enhance_prompt,
555
- conditioning_attention_strength=conditioning_strength,
556
- skip_stage_2=skip_stage_2,
557
  )
558
 
559
- log_memory("after pipeline call")
560
-
561
  output_path = tempfile.mktemp(suffix=".mp4")
562
  encode_video(
563
  video=video,
@@ -567,74 +755,73 @@ def generate_video(
567
  video_chunks_number=video_chunks_number,
568
  )
569
 
570
- log_memory("after encode_video")
571
  return str(output_path), current_seed
572
 
573
- except gr.Error:
574
- raise
575
  except Exception as e:
576
  import traceback
577
- log_memory("on error")
578
  print(f"Error: {str(e)}\n{traceback.format_exc()}")
579
  return None, current_seed
580
 
581
 
582
- with gr.Blocks(title="LTX-2.3 IC-LoRA") as demo:
583
- gr.Markdown("# LTX-2.3 IC-LoRA: Video-to-Video + Audio Conditioning")
 
 
 
584
  gr.Markdown(
585
- "Video-to-video transformations using IC-LoRA conditioning with optional audio-driven generation. "
586
- "Upload a **conditioning video** as the IC-LoRA reference signal, optionally add an **input audio** file "
587
- "to preserve soundtrack or lip-sync timing, optionally provide an input image for I2V, and describe the desired output. "
588
  "[[model]](https://huggingface.co/Lightricks/LTX-2.3) "
589
  "[[code]](https://github.com/Lightricks/LTX-2)"
590
  )
591
 
592
  with gr.Row():
593
  with gr.Column():
 
594
  with gr.Row():
595
- conditioning_video = gr.Video(
596
- label="Conditioning Video (IC-LoRA Reference)",
597
-
 
 
 
 
598
  )
599
- input_image = gr.Image(label="Input Image (Optional)", type="pil")
600
- input_audio = gr.Audio(label="Input Audio (Optional)", type="filepath")
601
- use_video_audio = gr.Checkbox(
602
- label="Use audio from reference video",
603
- value=False,
604
- info="extracts the audio track from the uploaded video",
605
  )
 
606
  prompt = gr.Textbox(
607
  label="Prompt",
608
- info="Describe the desired output β€” the IC-LoRA controls structure from the reference",
609
- value="A cinematic scene with dramatic lighting and rich detail, smooth motion",
610
  lines=3,
611
- placeholder="Describe the video you want to generate...",
612
  )
613
 
614
  with gr.Row():
615
- duration = gr.Slider(label="Duration (seconds)", minimum=1.0, maximum=10.0, value=3.0, step=0.1)
616
- ic_lora_choice = gr.Dropdown(
617
- label="IC-LoRA",
618
- choices=list(IC_LORA_OPTIONS.keys()),
619
- value=default_lora_name,
620
- visible=False
 
 
621
  )
622
 
623
-
 
 
 
624
  generate_btn = gr.Button("Generate Video", variant="primary", size="lg")
625
 
626
  with gr.Accordion("Advanced Settings", open=False):
627
- seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, value=42, step=1)
 
 
628
  randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
629
- with gr.Row():
630
- conditioning_strength = gr.Slider(
631
- label="Conditioning Strength", minimum=0.1, maximum=1.0, value=1.0, step=0.05,
632
- )
633
- with gr.Column():
634
- enhance_prompt = gr.Checkbox(label="Enhance Prompt", value=False)
635
- high_res = gr.Checkbox(label="High Resolution", value=True)
636
- skip_stage_2 = gr.Checkbox(label="Skip Stage 2 (faster, half res)", value=False)
637
-
638
  with gr.Row():
639
  width = gr.Number(label="Width", value=1536, precision=0)
640
  height = gr.Number(label="Height", value=1024, precision=0)
@@ -642,33 +829,32 @@ with gr.Blocks(title="LTX-2.3 IC-LoRA") as demo:
642
  with gr.Column():
643
  output_video = gr.Video(label="Generated Video", autoplay=True)
644
 
645
- # Auto-detect aspect ratio from uploaded image
646
  input_image.change(
647
- fn=on_media_upload,
648
- inputs=[input_image, high_res],
 
 
 
 
 
649
  outputs=[width, height],
650
  )
651
  high_res.change(
652
  fn=on_highres_toggle,
653
- inputs=[input_image, high_res],
654
  outputs=[width, height],
655
  )
656
-
657
  generate_btn.click(
658
  fn=generate_video,
659
  inputs=[
660
- input_image, conditioning_video,
661
- input_audio, use_video_audio, prompt, duration, ic_lora_choice, conditioning_strength,
662
- enhance_prompt, skip_stage_2,
663
  seed, randomize_seed, height, width,
664
  ],
665
  outputs=[output_video, seed],
666
  )
667
 
668
 
669
- css = """
670
- .fillable{max-width: 1200px !important}
671
- """
672
-
673
  if __name__ == "__main__":
674
- demo.launch(theme=gr.themes.Citrus(), css=css)
 
10
  subprocess.run([sys.executable, "-m", "pip", "install", "xformers==0.0.32.post2", "--no-build-isolation"], check=False)
11
 
12
  # Clone LTX-2 repo and install packages
13
+ LTX_REPO_URL = "https://github.com/Lightricks/LTX-2.git"
 
14
  LTX_REPO_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "LTX-2")
15
 
16
+ if not os.path.exists(LTX_REPO_DIR):
17
+ print(f"Cloning {LTX_REPO_URL}...")
18
+ subprocess.run(["git", "clone", "--depth", "1", LTX_REPO_URL, LTX_REPO_DIR], check=True)
 
 
 
 
 
19
 
20
  print("Installing ltx-core and ltx-pipelines from cloned repo...")
21
  subprocess.run(
 
25
  check=True,
26
  )
27
 
 
28
  sys.path.insert(0, os.path.join(LTX_REPO_DIR, "packages", "ltx-pipelines", "src"))
29
  sys.path.insert(0, os.path.join(LTX_REPO_DIR, "packages", "ltx-core", "src"))
30
 
 
41
  import gradio as gr
42
  import numpy as np
43
  from huggingface_hub import hf_hub_download, snapshot_download
44
+ from safetensors import safe_open
45
 
 
 
46
  from ltx_core.components.diffusion_steps import EulerDiffusionStep
47
  from ltx_core.components.noisers import GaussianNoiser
48
+ from ltx_core.conditioning import (
49
+ ConditioningItem,
50
+ ConditioningItemAttentionStrengthWrapper,
51
+ VideoConditionByReferenceLatent,
52
+ )
53
+ from ltx_core.loader import LoraPathStrengthAndSDOps, LTXV_LORA_COMFY_RENAMING_MAP
54
+ from ltx_core.model.audio_vae import decode_audio as vae_decode_audio
55
  from ltx_core.model.audio_vae import encode_audio as vae_encode_audio
56
  from ltx_core.model.upsampler import upsample_video
57
+ from ltx_core.model.video_vae import TilingConfig, VideoEncoder, get_video_chunks_number
58
  from ltx_core.model.video_vae import decode_video as vae_decode_video
59
+ from ltx_core.quantization import QuantizationPolicy
60
+ from ltx_core.types import Audio, AudioLatentShape, LatentState, VideoLatentShape, VideoPixelShape
61
+ from ltx_pipelines.utils import ModelLedger, euler_denoising_loop
 
62
  from ltx_pipelines.utils.args import ImageConditioningInput
63
  from ltx_pipelines.utils.constants import DISTILLED_SIGMA_VALUES, STAGE_2_DISTILLED_SIGMA_VALUES
64
+ from ltx_pipelines.utils.helpers import (
65
+ assert_resolution,
66
+ cleanup_memory,
67
+ combined_image_conditionings,
68
+ denoise_audio_video,
69
+ denoise_video_only,
70
+ encode_prompts,
71
+ get_device,
72
+ simple_denoising_func,
73
+ )
74
+ from ltx_pipelines.utils.media_io import (
75
+ decode_audio_from_file,
76
+ encode_video,
77
+ load_video_conditioning,
78
+ )
79
+ from ltx_pipelines.utils.types import PipelineComponents
80
 
81
  # Force-patch xformers attention into the LTX attention module.
82
  from ltx_core.model.transformer import attention as _attn_mod
 
90
 
91
  logging.getLogger().setLevel(logging.INFO)
92
 
 
 
 
 
 
 
 
 
93
 
94
+ # ─────────────────────────────────────────────────────────────────────────────
95
+ # Helper: read reference downscale factor from IC-LoRA metadata
96
+ # ─────────────────────────────────────────────────────────────────────────────
97
+ def _read_lora_reference_downscale_factor(lora_path: str) -> int:
98
+ try:
99
+ with safe_open(lora_path, framework="pt") as f:
100
+ metadata = f.metadata() or {}
101
+ return int(metadata.get("reference_downscale_factor", 1))
102
+ except Exception as e:
103
+ logging.warning(f"Failed to read metadata from LoRA file '{lora_path}': {e}")
104
+ return 1
105
+
106
+
107
+ # ─────────────────────────────────────────────────────────────────────────────
108
+ # Unified Pipeline: Distilled + Audio + IC-LoRA Video-to-Video
109
+ # ─────────────────────────────────────────────────────────────────────────────
110
+ class LTX23UnifiedPipeline:
111
+ """
112
+ Unified LTX-2.3 pipeline supporting all generation modes:
113
+ β€’ Text-to-Video
114
+ β€’ Image-to-Video (first-frame conditioning)
115
+ β€’ Audio-to-Video (lip-sync / BGM conditioning with external audio)
116
+ β€’ Video-to-Video (IC-LoRA reference video conditioning)
117
+ β€’ Any combination of the above
118
+
119
+ Architecture:
120
+ - stage_1_model_ledger: transformer WITH IC-LoRA fused (used for Stage 1)
121
+ - stage_2_model_ledger: transformer WITHOUT IC-LoRA (used for Stage 2 upsampling)
122
+ - When no IC-LoRA is provided, both stages use the same base model.
123
+ """
124
+
125
+ def __init__(
126
+ self,
127
+ distilled_checkpoint_path: str,
128
+ spatial_upsampler_path: str,
129
+ gemma_root: str,
130
+ ic_loras: list[LoraPathStrengthAndSDOps] | None = None,
131
+ device: torch.device | None = None,
132
+ quantization: QuantizationPolicy | None = None,
133
+ ):
134
+ self.device = device or get_device()
135
+ self.dtype = torch.bfloat16
136
 
137
+ ic_loras = ic_loras or []
138
+ self.has_ic_lora = len(ic_loras) > 0
 
 
 
 
 
 
 
 
 
139
 
140
+ # Stage 1: transformer with IC-LoRA (if provided)
141
+ self.stage_1_model_ledger = ModelLedger(
142
+ dtype=self.dtype,
143
+ device=self.device,
144
+ checkpoint_path=distilled_checkpoint_path,
145
+ spatial_upsampler_path=spatial_upsampler_path,
146
+ gemma_root_path=gemma_root,
147
+ loras=ic_loras,
148
+ quantization=quantization,
149
+ )
150
 
151
+ if self.has_ic_lora:
152
+ # Stage 2 needs a separate transformer WITHOUT IC-LoRA
153
+ self.stage_2_model_ledger = ModelLedger(
154
+ dtype=self.dtype,
155
+ device=self.device,
156
+ checkpoint_path=distilled_checkpoint_path,
157
+ spatial_upsampler_path=spatial_upsampler_path,
158
+ gemma_root_path=gemma_root,
159
+ loras=[],
160
+ quantization=quantization,
161
+ )
162
+ else:
163
+ # No IC-LoRA: share a single ledger for both stages (saves ~half VRAM)
164
+ self.stage_2_model_ledger = self.stage_1_model_ledger
165
 
166
+ self.pipeline_components = PipelineComponents(
167
+ dtype=self.dtype,
168
+ device=self.device,
169
+ )
170
 
171
+ # Read reference downscale factor from IC-LoRA metadata
172
+ self.reference_downscale_factor = 1
173
+ for lora in ic_loras:
174
+ scale = _read_lora_reference_downscale_factor(lora.path)
175
+ if scale != 1:
176
+ if self.reference_downscale_factor not in (1, scale):
177
+ raise ValueError(
178
+ f"Conflicting reference_downscale_factor: "
179
+ f"already {self.reference_downscale_factor}, got {scale}"
180
+ )
181
+ self.reference_downscale_factor = scale
182
 
183
+ # ── Video reference conditioning (from ICLoraPipeline) ───────────────
184
+ def _create_ic_conditionings(
185
+ self,
186
+ video_conditioning: list[tuple[str, float]],
187
+ height: int,
188
+ width: int,
189
+ num_frames: int,
190
+ video_encoder: VideoEncoder,
191
+ conditioning_strength: float = 1.0,
192
+ ) -> list[ConditioningItem]:
193
+ """Create IC-LoRA video reference conditioning items."""
194
+ conditionings: list[ConditioningItem] = []
195
+ scale = self.reference_downscale_factor
196
+ ref_height = height // scale
197
+ ref_width = width // scale
198
+
199
+ for video_path, strength in video_conditioning:
200
+ video = load_video_conditioning(
201
+ video_path=video_path,
202
+ height=ref_height,
203
+ width=ref_width,
204
+ frame_cap=num_frames,
205
+ dtype=self.dtype,
206
+ device=self.device,
207
+ )
208
+ encoded_video = video_encoder(video)
209
 
210
+ cond = VideoConditionByReferenceLatent(
211
+ latent=encoded_video,
212
+ downscale_factor=scale,
213
+ strength=strength,
214
+ )
215
+ if conditioning_strength < 1.0:
216
+ cond = ConditioningItemAttentionStrengthWrapper(
217
+ cond, attention_mask=conditioning_strength
218
+ )
219
+ conditionings.append(cond)
220
 
221
+ if conditionings:
222
+ logging.info(f"[IC-LoRA] Added {len(conditionings)} video conditioning(s)")
223
+ return conditionings
224
 
225
+ # ── Main generation entry point ──────────────────────────────────────
226
  def __call__(
227
  self,
228
  prompt: str,
 
232
  num_frames: int,
233
  frame_rate: float,
234
  images: list[ImageConditioningInput],
 
235
  audio_path: str | None = None,
236
+ video_conditioning: list[tuple[str, float]] | None = None,
237
  tiling_config: TilingConfig | None = None,
238
+ enhance_prompt: bool = False,
239
+ conditioning_strength: float = 1.0,
 
240
  ):
241
+ """
242
+ Generate video with any combination of conditioning.
243
+
244
+ Args:
245
+ audio_path: Path to external audio file for lipsync/BGM conditioning.
246
+ video_conditioning: List of (path, strength) tuples for IC-LoRA V2V.
247
+ conditioning_strength: Scale for IC-LoRA attention influence [0, 1].
248
+ Returns:
249
+ Tuple of (decoded_video_iterator, Audio).
250
+ """
251
+ assert_resolution(height=height, width=width, is_two_stage=True)
252
+
253
+ has_audio = audio_path is not None
254
+ has_video_cond = bool(video_conditioning)
 
 
255
 
256
  generator = torch.Generator(device=self.device).manual_seed(seed)
257
  noiser = GaussianNoiser(generator=generator)
258
  stepper = EulerDiffusionStep()
259
  dtype = torch.bfloat16
260
 
261
+ # ── Encode text prompt ───────────────────────────────────────────
262
+ # Use stage_1 ledger for prompt encoding (has text encoder)
263
  (ctx_p,) = encode_prompts(
264
  [prompt],
265
  self.stage_1_model_ledger,
266
  enhance_first_prompt=enhance_prompt,
267
  enhance_prompt_image=images[0].path if len(images) > 0 else None,
 
268
  )
269
  video_context, audio_context = ctx_p.video_encoding, ctx_p.audio_encoding
270
 
271
+ # ── Encode external audio (if provided) ─────────────────────────
272
+ encoded_audio_latent = None
273
+ decoded_audio_for_output = None
274
+ if has_audio:
275
+ video_duration = num_frames / frame_rate
276
+ decoded_audio = decode_audio_from_file(audio_path, self.device, 0.0, video_duration)
277
+ if decoded_audio is None:
278
+ raise ValueError(f"Could not extract audio stream from {audio_path}")
279
+
280
+ encoded_audio_latent = vae_encode_audio(
281
+ decoded_audio, self.stage_1_model_ledger.audio_encoder()
282
+ )
283
+ audio_shape = AudioLatentShape.from_duration(
284
+ batch=1, duration=video_duration, channels=8, mel_bins=16
285
+ )
286
+ expected_frames = audio_shape.frames
287
+ actual_frames = encoded_audio_latent.shape[2]
288
+
289
+ if actual_frames > expected_frames:
290
+ encoded_audio_latent = encoded_audio_latent[:, :, :expected_frames, :]
291
+ elif actual_frames < expected_frames:
292
+ pad = torch.zeros(
293
+ encoded_audio_latent.shape[0], encoded_audio_latent.shape[1],
294
+ expected_frames - actual_frames, encoded_audio_latent.shape[3],
295
+ device=encoded_audio_latent.device, dtype=encoded_audio_latent.dtype,
296
+ )
297
+ encoded_audio_latent = torch.cat([encoded_audio_latent, pad], dim=2)
298
+
299
+ decoded_audio_for_output = Audio(
300
+ waveform=decoded_audio.waveform.squeeze(0),
301
+ sampling_rate=decoded_audio.sampling_rate,
302
  )
 
303
 
304
+ # ── Build conditionings for Stage 1 ──────────────────────────────
305
+ # Use stage_1 video encoder (has IC-LoRA context)
306
  video_encoder = self.stage_1_model_ledger.video_encoder()
307
+
308
+ stage_1_output_shape = VideoPixelShape(
309
+ batch=1, frames=num_frames,
310
+ width=width // 2, height=height // 2, fps=frame_rate,
311
+ )
312
+
313
+ # Image conditionings
314
+ stage_1_conditionings = combined_image_conditionings(
315
  images=images,
 
316
  height=stage_1_output_shape.height,
317
  width=stage_1_output_shape.width,
318
  video_encoder=video_encoder,
319
+ dtype=dtype,
320
+ device=self.device,
 
321
  )
322
 
323
+ # IC-LoRA video reference conditionings
324
+ if has_video_cond:
325
+ ic_conds = self._create_ic_conditionings(
326
+ video_conditioning=video_conditioning,
327
+ height=stage_1_output_shape.height,
328
+ width=stage_1_output_shape.width,
329
+ num_frames=num_frames,
330
+ video_encoder=video_encoder,
331
+ conditioning_strength=conditioning_strength,
332
+ )
333
+ stage_1_conditionings.extend(ic_conds)
334
+
335
+ # ── Stage 1: Low-res generation ──────────────────────────────────
336
  transformer = self.stage_1_model_ledger.transformer()
337
  stage_1_sigmas = torch.Tensor(DISTILLED_SIGMA_VALUES).to(self.device)
338
 
339
+ def denoising_loop(sigmas, video_state, audio_state, stepper):
340
  return euler_denoising_loop(
341
  sigmas=sigmas,
342
  video_state=video_state,
 
349
  ),
350
  )
351
 
352
+ if has_audio:
353
+ # Audio mode: denoise video only, use external audio latent
354
+ video_state = denoise_video_only(
355
+ output_shape=stage_1_output_shape,
356
+ conditionings=stage_1_conditionings,
357
+ noiser=noiser,
358
+ sigmas=stage_1_sigmas,
359
+ stepper=stepper,
360
+ denoising_loop_fn=denoising_loop,
361
+ components=self.pipeline_components,
362
+ dtype=dtype,
363
+ device=self.device,
364
+ initial_audio_latent=encoded_audio_latent,
365
+ )
366
+ audio_state = None # we'll use the original audio for output
367
+ else:
368
+ # Standard / IC-only mode: denoise both audio and video
369
+ video_state, audio_state = denoise_audio_video(
370
+ output_shape=stage_1_output_shape,
371
+ conditionings=stage_1_conditionings,
372
+ noiser=noiser,
373
+ sigmas=stage_1_sigmas,
374
+ stepper=stepper,
375
+ denoising_loop_fn=denoising_loop,
376
+ components=self.pipeline_components,
377
+ dtype=dtype,
378
+ device=self.device,
379
+ )
380
 
381
  torch.cuda.synchronize()
 
382
  cleanup_memory()
383
 
384
+ # ── Stage 2: Upsample + Refine ──────────────────────────────────
 
 
 
 
 
 
 
 
385
  upscaled_video_latent = upsample_video(
386
  latent=video_state.latent[:1],
387
  video_encoder=video_encoder,
 
391
  torch.cuda.synchronize()
392
  cleanup_memory()
393
 
394
+ # Stage 2 uses the transformer WITHOUT IC-LoRA
395
+ transformer_s2 = self.stage_2_model_ledger.transformer()
396
  stage_2_sigmas = torch.Tensor(STAGE_2_DISTILLED_SIGMA_VALUES).to(self.device)
397
 
398
+ def denoising_loop_s2(sigmas, video_state, audio_state, stepper):
399
  return euler_denoising_loop(
400
  sigmas=sigmas,
401
  video_state=video_state,
 
404
  denoise_fn=simple_denoising_func(
405
  video_context=video_context,
406
  audio_context=audio_context,
407
+ transformer=transformer_s2,
408
  ),
409
  )
410
 
411
+ stage_2_output_shape = VideoPixelShape(
412
+ batch=1, frames=num_frames,
413
+ width=width, height=height, fps=frame_rate,
414
+ )
415
  stage_2_conditionings = combined_image_conditionings(
416
  images=images,
 
417
  height=stage_2_output_shape.height,
418
  width=stage_2_output_shape.width,
419
  video_encoder=video_encoder,
420
  dtype=dtype,
421
  device=self.device,
 
 
 
422
  )
423
 
424
+ if has_audio:
425
+ video_state = denoise_video_only(
426
+ output_shape=stage_2_output_shape,
427
+ conditionings=stage_2_conditionings,
428
+ noiser=noiser,
429
+ sigmas=stage_2_sigmas,
430
+ stepper=stepper,
431
+ denoising_loop_fn=denoising_loop_s2,
432
+ components=self.pipeline_components,
433
+ dtype=dtype,
434
+ device=self.device,
435
+ noise_scale=stage_2_sigmas[0],
436
+ initial_video_latent=upscaled_video_latent,
437
+ initial_audio_latent=encoded_audio_latent,
438
+ )
439
+ audio_state = None
440
+ else:
441
+ video_state, audio_state = denoise_audio_video(
442
+ output_shape=stage_2_output_shape,
443
+ conditionings=stage_2_conditionings,
444
+ noiser=noiser,
445
+ sigmas=stage_2_sigmas,
446
+ stepper=stepper,
447
+ denoising_loop_fn=denoising_loop_s2,
448
+ components=self.pipeline_components,
449
+ dtype=dtype,
450
+ device=self.device,
451
+ noise_scale=stage_2_sigmas[0],
452
+ initial_video_latent=upscaled_video_latent,
453
+ initial_audio_latent=audio_state.latent,
454
+ )
455
 
456
  torch.cuda.synchronize()
457
+ del transformer, transformer_s2, video_encoder
 
458
  cleanup_memory()
459
 
460
+ # ── Decode ───────────────────────────────────────────────────────
461
  decoded_video = vae_decode_video(
462
+ video_state.latent,
463
+ self.stage_2_model_ledger.video_decoder(),
464
+ tiling_config,
465
+ generator,
466
  )
467
+
468
+ if has_audio:
469
+ output_audio = decoded_audio_for_output
470
+ else:
471
+ output_audio = vae_decode_audio(
472
+ audio_state.latent,
473
+ self.stage_2_model_ledger.audio_decoder(),
474
+ self.stage_2_model_ledger.vocoder(),
475
+ )
476
+
477
+ return decoded_video, output_audio
478
+
479
+
480
# ─────────────────────────────────────────────────────────────────────────────
# Constants
# ─────────────────────────────────────────────────────────────────────────────
# Upper bound for the seed slider / RNG seeding (fits in a signed 32-bit int).
MAX_SEED = np.iinfo(np.int32).max
DEFAULT_PROMPT = (
    "An astronaut hatches from a fragile egg on the surface of the Moon, "
    "the shell cracking and peeling apart in gentle low-gravity motion."
)
DEFAULT_FRAME_RATE = 24.0

# (width, height) presets per quality tier and aspect ratio.
RESOLUTIONS = {
    "high": {"16:9": (1536, 1024), "9:16": (1024, 1536), "1:1": (1024, 1024)},
    "low": {"16:9": (768, 512), "9:16": (512, 768), "1:1": (768, 768)},
}

# Available IC-LoRA models: display name -> HF Hub repo + weights filename.
IC_LORA_OPTIONS = {
    "Union Control (Depth + Edge)": {
        "repo": "Lightricks/LTX-2.3-22b-IC-LoRA-Union-Control",
        "filename": "ltx-2.3-22b-ic-lora-union-control-ref0.5.safetensors",
    },
    "Motion Track Control": {
        "repo": "Lightricks/LTX-2.3-22b-IC-LoRA-Motion-Track-Control",
        "filename": "ltx-2.3-22b-ic-lora-motion-track-control-ref0.5.safetensors",
    },
}
DEFAULT_IC_LORA = "Union Control (Depth + Edge)"
+
508
+
509
+ # ─────────────────────────────────────────────────────────────────────────────
510
+ # Download Models
511
+ # ─────────────────────────────────────────────────────────────────────────────
512
+ LTX_MODEL_REPO = "Lightricks/LTX-2.3"
513
+ GEMMA_REPO = "google/gemma-3-12b-it-qat-q4_0-unquantized"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
514
 
515
  print("=" * 80)
516
+ print("Downloading LTX-2.3 distilled model + Gemma + IC-LoRA...")
517
  print("=" * 80)
518
 
519
+ checkpoint_path = hf_hub_download(
520
+ repo_id=LTX_MODEL_REPO, filename="ltx-2.3-22b-distilled.safetensors"
521
+ )
522
+ spatial_upsampler_path = hf_hub_download(
523
+ repo_id=LTX_MODEL_REPO, filename="ltx-2.3-spatial-upscaler-x2-1.0.safetensors"
524
+ )
525
+ gemma_root = snapshot_download(repo_id=GEMMA_REPO)
526
+
527
+ # Download default IC-LoRA
528
+ default_lora_info = IC_LORA_OPTIONS[DEFAULT_IC_LORA]
529
+ default_ic_lora_path = hf_hub_download(
530
+ repo_id=default_lora_info["repo"], filename=default_lora_info["filename"]
531
+ )
532
+
533
+ print(f"Checkpoint: {checkpoint_path}")
534
+ print(f"Spatial upsampler: {spatial_upsampler_path}")
535
+ print(f"Gemma root: {gemma_root}")
536
+ print(f"IC-LoRA: {default_ic_lora_path}")
537
+
538
+
539
+ # ─────────────────────────────────────────────────────────────────────────────
540
+ # Initialize Pipeline
541
+ # ─────────────────────────────────────────────────────────────────────────────
542
+ ic_loras = [
543
+ LoraPathStrengthAndSDOps(default_ic_lora_path, 1.0, LTXV_LORA_COMFY_RENAMING_MAP)
544
+ ]
545
+
546
+ pipeline = LTX23UnifiedPipeline(
547
+ distilled_checkpoint_path=checkpoint_path,
548
+ spatial_upsampler_path=spatial_upsampler_path,
549
+ gemma_root=gemma_root,
550
+ ic_loras=ic_loras,
551
+ quantization=QuantizationPolicy.fp8_cast(),
552
+ )
553
 
554
# Preload all models for ZeroGPU tensor packing.
print("Preloading all models (including Gemma, Audio encoders)...")

# Shared ledger: preload once. Separate ledgers (IC-LoRA): preload both.
_ledger_1 = pipeline.stage_1_model_ledger
_ledger_2 = pipeline.stage_2_model_ledger
_shared = _ledger_1 is _ledger_2

# Stage 1 models (with IC-LoRA if loaded).
# Pattern: call each ledger accessor once to materialize the model, then
# monkey-patch the accessor with a lambda returning the cached instance, so
# later accessor calls inside the pipeline reuse the preloaded object.
# NOTE(review): assumes the accessors are plain zero-arg methods — confirm
# against ModelLedger.
_s1_transformer = _ledger_1.transformer()
_s1_video_encoder = _ledger_1.video_encoder()
_s1_text_encoder = _ledger_1.text_encoder()
_s1_embeddings = _ledger_1.gemma_embeddings_processor()
_s1_audio_encoder = _ledger_1.audio_encoder()

_ledger_1.transformer = lambda: _s1_transformer
_ledger_1.video_encoder = lambda: _s1_video_encoder
_ledger_1.text_encoder = lambda: _s1_text_encoder
_ledger_1.gemma_embeddings_processor = lambda: _s1_embeddings
_ledger_1.audio_encoder = lambda: _s1_audio_encoder

if _shared:
    # Single ledger — also preload decoder/upsampler/vocoder on the same object
    _video_decoder = _ledger_1.video_decoder()
    _audio_decoder = _ledger_1.audio_decoder()
    _vocoder = _ledger_1.vocoder()
    _spatial_upsampler = _ledger_1.spatial_upsampler()

    _ledger_1.video_decoder = lambda: _video_decoder
    _ledger_1.audio_decoder = lambda: _audio_decoder
    _ledger_1.vocoder = lambda: _vocoder
    _ledger_1.spatial_upsampler = lambda: _spatial_upsampler
    print("  (single shared ledger — no IC-LoRA)")
else:
    # Stage 2 models (separate transformer without IC-LoRA)
    _s2_transformer = _ledger_2.transformer()
    _s2_video_encoder = _ledger_2.video_encoder()
    _s2_video_decoder = _ledger_2.video_decoder()
    _s2_audio_decoder = _ledger_2.audio_decoder()
    _s2_vocoder = _ledger_2.vocoder()
    _s2_spatial_upsampler = _ledger_2.spatial_upsampler()
    _s2_text_encoder = _ledger_2.text_encoder()
    _s2_embeddings = _ledger_2.gemma_embeddings_processor()
    _s2_audio_encoder = _ledger_2.audio_encoder()

    _ledger_2.transformer = lambda: _s2_transformer
    _ledger_2.video_encoder = lambda: _s2_video_encoder
    _ledger_2.video_decoder = lambda: _s2_video_decoder
    _ledger_2.audio_decoder = lambda: _s2_audio_decoder
    _ledger_2.vocoder = lambda: _s2_vocoder
    _ledger_2.spatial_upsampler = lambda: _s2_spatial_upsampler
    _ledger_2.text_encoder = lambda: _s2_text_encoder
    _ledger_2.gemma_embeddings_processor = lambda: _s2_embeddings
    _ledger_2.audio_encoder = lambda: _s2_audio_encoder
    print("  (two separate ledgers — IC-LoRA active)")

print("All models preloaded!")
print("=" * 80)
612
 
613
 
614
# ─────────────────────────────────────────────────────────────────────────────
# UI Helpers
# ─────────────────────────────────────────────────────────────────────────────
_IMAGE_EXTENSIONS = ("jpg", "jpeg", "png", "bmp", "webp", "gif", "tiff")


def _probe_image_size(media_path):
    """Return (width, height) of an image, or None if it cannot be read.

    The import is inside the try so a missing Pillow installation degrades
    to the "cannot read" path instead of raising ImportError.
    """
    try:
        import PIL.Image
        with PIL.Image.open(media_path) as img:
            return img.size
    except Exception:
        return None


def _probe_video_size(media_path):
    """Return (width, height) of a video's first stream, or None on failure."""
    try:
        import av
        with av.open(str(media_path)) as container:
            stream = container.streams.video[0]
            return stream.codec_context.width, stream.codec_context.height
    except Exception:
        return None


def detect_aspect_ratio(media_path) -> str:
    """Detect the closest aspect ratio from an image or video.

    Args:
        media_path: Path to an image or video file, or None.

    Returns:
        One of "16:9", "9:16" or "1:1" — whichever is nearest to the media's
        width/height ratio. Defaults to "16:9" when the path is None or the
        file cannot be probed.
    """
    if media_path is None:
        return "16:9"

    ext = str(media_path).lower().rsplit(".", 1)[-1] if "." in str(media_path) else ""

    if ext in _IMAGE_EXTENSIONS:
        size = _probe_image_size(media_path)
    else:
        # Unknown extension: try video first, then fall back to image probing.
        size = _probe_video_size(media_path) or _probe_image_size(media_path)

    if size is None:
        return "16:9"

    w, h = size
    ratio = w / h
    candidates = {"16:9": 16 / 9, "9:16": 9 / 16, "1:1": 1.0}
    return min(candidates, key=lambda k: abs(ratio - candidates[k]))
651
 
652
 
653
def on_image_upload(image, video, high_res):
    """Auto-set resolution when image is uploaded."""
    # Prefer the freshly uploaded image; otherwise probe any existing video.
    if image is not None:
        media = image
    else:
        media = video
    quality = "high" if high_res else "low"
    new_width, new_height = RESOLUTIONS[quality][detect_aspect_ratio(media)]
    return gr.update(value=new_width), gr.update(value=new_height)
660
+
661
+
662
def on_video_upload(video, image, high_res):
    """Auto-set resolution when video is uploaded."""
    # The just-uploaded video wins; an already-present image is the fallback.
    media = video
    if media is None:
        media = image
    quality = "high" if high_res else "low"
    new_width, new_height = RESOLUTIONS[quality][detect_aspect_ratio(media)]
    return gr.update(value=new_width), gr.update(value=new_height)
669
 
670
 
671
def on_highres_toggle(image, video, high_res):
    """Update resolution when high-res toggle changes."""
    # Re-derive aspect from whichever media is present (image takes priority).
    media = image
    if media is None:
        media = video
    quality = "high" if high_res else "low"
    new_width, new_height = RESOLUTIONS[quality][detect_aspect_ratio(media)]
    return gr.update(value=new_width), gr.update(value=new_height)
678
 
679
 
680
+ # ───────────────────��─────────────────────────────────────────────────────────
681
+ # Generation
682
+ # ─────────────────────────────────────────────────────────────────────────────
683
+ @spaces.GPU(duration=180)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
684
  @torch.inference_mode()
685
  def generate_video(
686
  input_image,
687
+ input_video,
688
  input_audio,
 
689
  prompt: str,
690
  duration: float,
 
691
  conditioning_strength: float,
692
  enhance_prompt: bool,
 
693
  seed: int,
694
  randomize_seed: bool,
695
  height: int,
696
  width: int,
697
  progress=gr.Progress(track_tqdm=True),
698
  ):
 
 
699
  try:
700
  torch.cuda.reset_peak_memory_stats()
 
 
 
 
 
 
 
 
 
701
  current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
702
 
703
  frame_rate = DEFAULT_FRAME_RATE
704
  num_frames = int(duration * frame_rate) + 1
705
  num_frames = ((num_frames - 1 + 7) // 8) * 8 + 1
706
 
707
+ mode_parts = []
708
+ if input_image is not None:
709
+ mode_parts.append("Image")
710
+ if input_video is not None:
711
+ mode_parts.append("Video(IC-LoRA)")
712
  if input_audio is not None:
713
+ mode_parts.append("Audio")
714
+ if not mode_parts:
715
+ mode_parts.append("Text")
716
+ mode_str = " + ".join(mode_parts)
717
 
718
+ print(f"[{mode_str}] Generating: {height}x{width}, {num_frames} frames "
719
+ f"({duration}s), seed={current_seed}")
720
 
721
+ # Build image conditionings
722
  images = []
723
  if input_image is not None:
724
+ images = [ImageConditioningInput(path=str(input_image), frame_idx=0, strength=1.0)]
725
+
726
+ # Build video conditionings for IC-LoRA / V2V
727
+ video_conditioning = None
728
+ if input_video is not None:
729
+ video_conditioning = [(str(input_video), 1.0)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
730
 
731
  tiling_config = TilingConfig.default()
732
  video_chunks_number = get_video_chunks_number(num_frames, tiling_config)
733
 
734
+ video, audio = pipeline(
 
 
735
  prompt=prompt,
736
  seed=current_seed,
737
  height=int(height),
 
739
  num_frames=num_frames,
740
  frame_rate=frame_rate,
741
  images=images,
742
+ audio_path=input_audio,
743
  video_conditioning=video_conditioning,
 
744
  tiling_config=tiling_config,
745
  enhance_prompt=enhance_prompt,
746
+ conditioning_strength=conditioning_strength,
 
747
  )
748
 
 
 
749
  output_path = tempfile.mktemp(suffix=".mp4")
750
  encode_video(
751
  video=video,
 
755
  video_chunks_number=video_chunks_number,
756
  )
757
 
 
758
  return str(output_path), current_seed
759
 
 
 
760
  except Exception as e:
761
  import traceback
 
762
  print(f"Error: {str(e)}\n{traceback.format_exc()}")
763
  return None, current_seed
764
 
765
 
766
+ # ─────────────────────────────────────────────────────────────────────────────
767
+ # Gradio UI
768
+ # ─────────────────────────────────────────────────────────────────────────────
769
+ with gr.Blocks(title="LTX-2.3 Unified: V2V + I2V + A2V") as demo:
770
+ gr.Markdown("# LTX-2.3 Unified: Video/Image/Audio β†’ Video")
771
  gr.Markdown(
772
+ "Unified pipeline for **video-to-video** (IC-LoRA), **image-to-video**, "
773
+ "and **audio-conditioned** generation with LTX-2.3 β€” use any combination of inputs. "
 
774
  "[[model]](https://huggingface.co/Lightricks/LTX-2.3) "
775
  "[[code]](https://github.com/Lightricks/LTX-2)"
776
  )
777
 
778
  with gr.Row():
779
  with gr.Column():
780
+ # All three inputs visible at once
781
  with gr.Row():
782
+ input_image = gr.Image(
783
+ label="πŸ–ΌοΈ Input Image (I2V β€” first frame)",
784
+ type="filepath",
785
+ )
786
+ input_video = gr.Video(
787
+ label="🎬 Reference Video (V2V β€” IC-LoRA)",
788
+ sources=["upload"],
789
  )
790
+ input_audio = gr.Audio(
791
+ label="πŸ”Š Input Audio (A2V β€” lipsync / BGM)",
792
+ type="filepath",
 
 
 
793
  )
794
+
795
  prompt = gr.Textbox(
796
  label="Prompt",
797
+ info="Describe the desired output β€” be as detailed as possible",
798
+ value="Make this come alive with cinematic motion, smooth animation",
799
  lines=3,
800
+ placeholder="Describe the motion, style, and content you want...",
801
  )
802
 
803
  with gr.Row():
804
+ duration = gr.Slider(
805
+ label="Duration (seconds)",
806
+ minimum=1.0, maximum=10.0, value=3.0, step=0.1,
807
+ )
808
+ conditioning_strength = gr.Slider(
809
+ label="V2V Conditioning Strength",
810
+ info="How closely to follow the reference video",
811
+ minimum=0.0, maximum=1.0, value=1.0, step=0.05,
812
  )
813
 
814
+ with gr.Row():
815
+ enhance_prompt = gr.Checkbox(label="Enhance Prompt", value=True)
816
+ high_res = gr.Checkbox(label="High Resolution", value=True)
817
+
818
  generate_btn = gr.Button("Generate Video", variant="primary", size="lg")
819
 
820
  with gr.Accordion("Advanced Settings", open=False):
821
+ seed = gr.Slider(
822
+ label="Seed", minimum=0, maximum=MAX_SEED, value=42, step=1,
823
+ )
824
  randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
 
 
 
 
 
 
 
 
 
825
  with gr.Row():
826
  width = gr.Number(label="Width", value=1536, precision=0)
827
  height = gr.Number(label="Height", value=1024, precision=0)
 
829
  with gr.Column():
830
  output_video = gr.Video(label="Generated Video", autoplay=True)
831
 
832
+ # ── Event handlers ───────────────────────────────────────────────────
833
  input_image.change(
834
+ fn=on_image_upload,
835
+ inputs=[input_image, input_video, high_res],
836
+ outputs=[width, height],
837
+ )
838
+ input_video.change(
839
+ fn=on_video_upload,
840
+ inputs=[input_video, input_image, high_res],
841
  outputs=[width, height],
842
  )
843
  high_res.change(
844
  fn=on_highres_toggle,
845
+ inputs=[input_image, input_video, high_res],
846
  outputs=[width, height],
847
  )
 
848
  generate_btn.click(
849
  fn=generate_video,
850
  inputs=[
851
+ input_image, input_video, input_audio, prompt, duration,
852
+ conditioning_strength, enhance_prompt,
 
853
  seed, randomize_seed, height, width,
854
  ],
855
  outputs=[output_video, seed],
856
  )
857
 
858
 
 
 
 
 
859
if __name__ == "__main__":
    # BUG FIX: `theme` is a gr.Blocks(...) constructor argument, not a
    # launch() kwarg — passing it here raises TypeError on current Gradio
    # releases. Apply the theme via gr.Blocks(theme=gr.themes.Citrus())
    # at UI construction time instead.
    demo.launch()