TestingwithNeg

Sleeping

App Files Files Community

dagloop5 committed on Apr 18

Commit

287d66d

verified ·

1 Parent(s): 9d8a7db

Delete app.py

Browse files

Files changed (1) hide show

app.py +0 -927

app.py DELETED Viewed

@@ -1,927 +0,0 @@
-# =============================================================================
-# Installation and Setup
-# =============================================================================
-import os
-import subprocess
-import sys
-# Disable torch.compile / dynamo before any torch import
-os.environ["TORCH_COMPILE_DISABLE"] = "1"
-os.environ["TORCHDYNAMO_DISABLE"] = "1"
-# Clone the LTX-2 repo at a pinned commit so the vendored packages are reproducible
-LTX_REPO_URL = "https://github.com/Lightricks/LTX-2.git"
-LTX_REPO_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "LTX-2")
-LTX_COMMIT_SHA = "a2c3f24078eb918171967f74b6f66b756b29ee45"
-if not os.path.exists(LTX_REPO_DIR):
-    import shutil
-    print(f"Cloning {LTX_REPO_URL} at commit {LTX_COMMIT_SHA}...")
-    os.makedirs(LTX_REPO_DIR)
-    try:
-        subprocess.run(["git", "init", LTX_REPO_DIR], check=True)
-        subprocess.run(["git", "remote", "add", "origin", LTX_REPO_URL], cwd=LTX_REPO_DIR, check=True)
-        # Shallow-fetch only the pinned commit instead of the whole history
-        subprocess.run(["git", "fetch", "--depth", "1", "origin", LTX_COMMIT_SHA], cwd=LTX_REPO_DIR, check=True)
-        subprocess.run(["git", "checkout", LTX_COMMIT_SHA], cwd=LTX_REPO_DIR, check=True)
-    except BaseException:
-        # A half-initialised checkout would make the existence check above skip
-        # cloning on the next start, so remove it before propagating the error.
-        shutil.rmtree(LTX_REPO_DIR, ignore_errors=True)
-        raise
-# Make the vendored ltx-core and ltx-pipelines packages importable
-sys.path.insert(0, os.path.join(LTX_REPO_DIR, "packages", "ltx-pipelines", "src"))
-sys.path.insert(0, os.path.join(LTX_REPO_DIR, "packages", "ltx-core", "src"))
-# =============================================================================
-# Imports
-# =============================================================================
-import logging
-import random
-import tempfile
-from pathlib import Path
-from typing import Optional, Any
-import torch
-torch._dynamo.config.suppress_errors = True
-torch._dynamo.config.disable = True
-import gradio as gr
-import spaces
-import numpy as np
-from huggingface_hub import hf_hub_download, snapshot_download
-# Core LTX imports
-from ltx_core.model.video_vae import TilingConfig, get_video_chunks_number
-from ltx_core.quantization import QuantizationPolicy
-from ltx_core.loader import LoraPathStrengthAndSDOps, LTXV_LORA_COMFY_RENAMING_MAP
-from ltx_core.components.guiders import MultiModalGuider, MultiModalGuiderParams
-from ltx_core.components.noisers import GaussianNoiser
-from ltx_core.components.schedulers import LTX2Scheduler
-from ltx_core.components.diffusion_steps import Res2sDiffusionStep
-from ltx_core.types import Audio, VideoLatentShape, VideoPixelShape
-# Pipeline utilities
-from ltx_pipelines.utils.args import ImageConditioningInput
-from ltx_pipelines.utils.media_io import encode_video
-from ltx_pipelines.utils.denoisers import GuidedDenoiser, SimpleDenoiser
-from ltx_pipelines.utils.samplers import res2s_audio_video_denoising_loop
-from ltx_pipelines.utils.types import ModalitySpec
-from ltx_pipelines.utils.helpers import assert_resolution, combined_image_conditionings, get_device
-from ltx_pipelines.utils.constants import LTX_2_3_HQ_PARAMS, STAGE_2_DISTILLED_SIGMA_VALUES
-# Model builders
-from ltx_core.loader.single_gpu_model_builder import (
-    TransformerBuilder,
-    VideoEncoderBuilder,
-    VideoDecoderBuilder,
-    AudioDecoderBuilder,
-    VocoderBuilder,
-    UpsamplerBuilder,
-    TextEncoderBuilder,
-)
-from ltx_core.model.transformer import X0Model
-from ltx_core.model.video_vae import VideoEncoder, VideoDecoder
-from ltx_core.model.audio_vae import AudioDecoder as AVAudioDecoder, Vocoder
-from ltx_core.model.upsampler import LatentUpsampler
-from ltx_core.text_encoders.gemma import GemmaTextEncoder
-from ltx_core.text_encoders.gemma.embeddings_processor import EmbeddingsProcessorBuilder
-logging.getLogger().setLevel(logging.INFO)
-# =============================================================================
-# Constants and Configuration
-# =============================================================================
-LTX_MODEL_REPO = "Lightricks/LTX-2.3"
-GEMMA_REPO = "Lightricks/gemma-3-12b-it-qat-q4_0-unquantized"
-DEFAULT_FRAME_RATE = 24.0
-MIN_DIM, MAX_DIM, STEP = 256, 1280, 64
-MIN_FRAMES, MAX_FRAMES = 9, 257
-MAX_SEED = np.iinfo(np.int32).max
-DEFAULT_PROMPT = (
-    "A majestic eagle soaring over mountain peaks at sunset, "
-    "wings spread wide against the orange sky, feathers catching the light, "
-    "wind currents visible in the motion blur, cinematic slow motion, 4K quality"
-)
-DEFAULT_NEGATIVE_PROMPT = (
-    "worst quality, inconsistent motion, blurry, jittery, distorted, "
-    "deformed, artifacts, text, watermark, logo, frame, border, "
-    "low resolution, pixelated, unnatural, fake, CGI, cartoon"
-)
-# =============================================================================
-# HQ Pipeline with model_ledger - Custom Implementation
-# =============================================================================
-class HQModelLedger:
-    """
-    Lazily builds and caches every model used by the HQ pipeline.
-
-    Each accessor builds its model through the matching builder on first use,
-    stores it on the instance, and returns the stored instance afterwards.
-    """
-    def __init__(
-        self,
-        checkpoint_path: str,
-        distilled_lora_path: str,
-        distilled_lora_strength_stage_1: float,
-        distilled_lora_strength_stage_2: float,
-        spatial_upsampler_path: str,
-        gemma_root: str,
-        loras: tuple,
-        device: torch.device,
-        dtype: torch.dtype,
-        quantization: Optional[QuantizationPolicy] = None,
-    ):
-        # Placement and precision shared by every model
-        self.device = device
-        self._target_device = device
-        self.dtype = dtype
-        # Weight locations
-        self._checkpoint_path = checkpoint_path
-        self._spatial_upsampler_path = spatial_upsampler_path
-        self._gemma_root = gemma_root
-        self._quantization = quantization
-        # One cache slot per model, filled on first access
-        for slot in (
-            "_transformer_stage1",
-            "_transformer_stage2",
-            "_video_encoder",
-            "_video_decoder",
-            "_audio_decoder",
-            "_vocoder",
-            "_spatial_upsampler",
-            "_text_encoder",
-            "_embeddings_processor",
-        ):
-            setattr(self, slot, None)
-        # LoRA settings
-        self._distilled_lora_path = distilled_lora_path
-        self._distilled_lora_strength_stage_1 = distilled_lora_strength_stage_1
-        self._distilled_lora_strength_stage_2 = distilled_lora_strength_stage_2
-        self._loras = loras
-        self._build_configs()
-    def _build_configs(self):
-        """Create one builder per model; only the transformer builders carry LoRAs."""
-        def lora_stack(strength):
-            # Distilled LoRA first, followed by the caller-supplied LoRAs in order
-            distilled = LoraPathStrengthAndSDOps(
-                path=self._distilled_lora_path,
-                strength=strength,
-                sd_ops=LTXV_LORA_COMFY_RENAMING_MAP,
-            )
-            return [distilled, *self._loras]
-        ckpt = self._checkpoint_path
-        # The two stages differ only in the distilled LoRA strength
-        stage1_stack = lora_stack(self._distilled_lora_strength_stage_1)
-        stage2_stack = lora_stack(self._distilled_lora_strength_stage_2)
-        self._transformer_builder_stage1 = TransformerBuilder.from_checkpoint(ckpt).with_loras(stage1_stack)
-        self._transformer_builder_stage2 = TransformerBuilder.from_checkpoint(ckpt).with_loras(stage2_stack)
-        # Remaining builders take no LoRAs
-        self._video_encoder_builder = VideoEncoderBuilder.from_checkpoint(ckpt)
-        self._video_decoder_builder = VideoDecoderBuilder.from_checkpoint(ckpt)
-        self._audio_decoder_builder = AudioDecoderBuilder.from_checkpoint(ckpt)
-        self._vocoder_builder = VocoderBuilder.from_checkpoint(ckpt)
-        self._spatial_upsampler_builder = UpsamplerBuilder.from_checkpoint(self._spatial_upsampler_path)
-        self._text_encoder_builder = TextEncoderBuilder.from_gemma(self._gemma_root)
-        self._embeddings_processor_builder = EmbeddingsProcessorBuilder.from_checkpoint(ckpt)
-    def _get_or_build(self, slot, builder, label, wrap=None):
-        """Return the model cached in *slot*, building it with *builder* if the slot is empty."""
-        cached = getattr(self, slot)
-        if cached is None:
-            print(f"    Building {label}...")
-            built = builder.build(device=self._target_device, dtype=self.dtype)
-            if wrap is not None:
-                built = wrap(built)
-            cached = built.to(self.device).eval()
-            setattr(self, slot, cached)
-        return cached
-    def transformer(self, stage: int = 1):
-        """Get or build the transformer; any stage other than 1 selects stage 2."""
-        if stage == 1:
-            return self._get_or_build(
-                "_transformer_stage1", self._transformer_builder_stage1, "transformer (stage 1)", wrap=X0Model
-            )
-        return self._get_or_build(
-            "_transformer_stage2", self._transformer_builder_stage2, "transformer (stage 2)", wrap=X0Model
-        )
-    def video_encoder(self):
-        """Get or build video encoder."""
-        return self._get_or_build("_video_encoder", self._video_encoder_builder, "video encoder")
-    def video_decoder(self):
-        """Get or build video decoder."""
-        return self._get_or_build("_video_decoder", self._video_decoder_builder, "video decoder")
-    def audio_decoder(self):
-        """Get or build audio decoder."""
-        return self._get_or_build("_audio_decoder", self._audio_decoder_builder, "audio decoder")
-    def vocoder(self):
-        """Get or build vocoder."""
-        return self._get_or_build("_vocoder", self._vocoder_builder, "vocoder")
-    def spatial_upsampler(self):
-        """Get or build spatial upsampler."""
-        return self._get_or_build("_spatial_upsampler", self._spatial_upsampler_builder, "spatial upsampler")
-    def text_encoder(self):
-        """Get or build text encoder."""
-        return self._get_or_build("_text_encoder", self._text_encoder_builder, "text encoder (Gemma)")
-    def embeddings_processor(self):
-        """Get or build embeddings processor."""
-        return self._get_or_build(
-            "_embeddings_processor", self._embeddings_processor_builder, "embeddings processor"
-        )
-# Pipeline flow: encode prompts, run a guided pass at half resolution, upsample
-# the video latent, run a second pass at full resolution, then decode.
-class TI2VidTwoStagesHQPipelineWithLedger:
-    """
-    Two-stage text/image-to-video generation pipeline using model_ledger.
-    Same as TI2VidTwoStagesHQPipeline but uses model_ledger for ZeroGPU compatibility.
-    """
-    def __init__(
-        self,
-        checkpoint_path: str,
-        distilled_lora_path: str,
-        distilled_lora_strength_stage_1: float,
-        distilled_lora_strength_stage_2: float,
-        spatial_upsampler_path: str,
-        gemma_root: str,
-        loras: tuple = (),
-        device: Optional[torch.device] = None,
-        quantization: Optional[QuantizationPolicy] = None,
-        torch_compile: bool = False,
-    ):
-        # Fall back to get_device() when no device is given; models always use bfloat16
-        self.device = device or get_device()
-        self.dtype = torch.bfloat16
-        # NOTE(review): torch_compile is stored but not read anywhere in this class
-        self._torch_compile = torch_compile
-        # Create model ledger
-        self.model_ledger = HQModelLedger(
-            checkpoint_path=checkpoint_path,
-            distilled_lora_path=distilled_lora_path,
-            distilled_lora_strength_stage_1=distilled_lora_strength_stage_1,
-            distilled_lora_strength_stage_2=distilled_lora_strength_stage_2,
-            spatial_upsampler_path=spatial_upsampler_path,
-            gemma_root=gemma_root,
-            loras=loras,
-            device=self.device,
-            dtype=self.dtype,
-            quantization=quantization,
-        )
-        # Scheduler and stepper
-        self._scheduler = LTX2Scheduler()
-        self._stepper = Res2sDiffusionStep()
-    # Returns (decoded_video, decoded_audio) exactly as produced by the two decoders.
-    @torch.inference_mode()
-    def __call__(
-        self,
-        prompt: str,
-        negative_prompt: str,
-        seed: int,
-        height: int,
-        width: int,
-        num_frames: int,
-        frame_rate: float,
-        num_inference_steps: int,
-        video_guider_params: MultiModalGuiderParams,
-        audio_guider_params: MultiModalGuiderParams,
-        images: list[ImageConditioningInput],
-        tiling_config: Optional[TilingConfig] = None,
-        enhance_prompt: bool = False,
-        streaming_prefetch_count: Optional[int] = None,
-        max_batch_size: int = 1,
-    ):
-        assert_resolution(height=height, width=width, is_two_stage=True)
-        # One seeded generator feeds the noiser for both stages and is also
-        # handed to the video decoder at the end.
-        generator = torch.Generator(device=self.device).manual_seed(seed)
-        noiser = GaussianNoiser(generator=generator)
-        # Get models from ledger
-        text_encoder = self.model_ledger.text_encoder()
-        embeddings_processor = self.model_ledger.embeddings_processor()
-        video_encoder = self.model_ledger.video_encoder()
-        # Encode prompts
-        # Encode positive prompt
-        # The first image path and the seed are forwarded only when enhance_prompt is set
-        ctx_p = embeddings_processor.create_embeddings(
-            text_encoder([prompt]),
-            video_encoder,
-            images[0].path if len(images) > 0 and enhance_prompt else None,
-            seed if enhance_prompt else None,
-        )
-        # Encode negative prompt
-        ctx_n = embeddings_processor.create_embeddings(
-            text_encoder([negative_prompt]),
-            video_encoder,
-            None,
-            None,
-        )
-        v_context_p, a_context_p = ctx_p.video_encoding, ctx_p.audio_encoding
-        v_context_n, a_context_n = ctx_n.video_encoding, ctx_n.audio_encoding
-        # Stage 1: Generate at half resolution with CFG
-        stage_1_output_shape = VideoPixelShape(
-            batch=1,
-            frames=num_frames,
-            width=width // 2,
-            height=height // 2,
-            fps=frame_rate,
-        )
-        stage_1_conditionings = combined_image_conditionings(
-            images=images,
-            height=stage_1_output_shape.height,
-            width=stage_1_output_shape.width,
-            video_encoder=video_encoder,
-            dtype=self.dtype,
-            device=self.device,
-        )
-        # Uninitialised tensor passed to the scheduler to derive the sigma schedule
-        # (presumably only its shape matters — TODO confirm against LTX2Scheduler)
-        empty_latent = torch.empty(
-            VideoLatentShape.from_pixel_shape(stage_1_output_shape).to_torch_shape(),
-            dtype=self.dtype,
-            device=self.device,
-        )
-        sigmas = self._scheduler.execute(latent=empty_latent, steps=num_inference_steps)
-        sigmas = sigmas.to(dtype=torch.float32, device=self.device)
-        transformer = self.model_ledger.transformer(stage=1)
-        # Negative contexts and guider params are used only in this stage
-        video_state, audio_state = res2s_audio_video_denoising_loop(
-            transformer=transformer,
-            denoiser=GuidedDenoiser(
-                v_context=v_context_p,
-                a_context=a_context_p,
-                video_guider=MultiModalGuider(params=video_guider_params, negative_context=v_context_n),
-                audio_guider=MultiModalGuider(params=audio_guider_params, negative_context=a_context_n),
-            ),
-            sigmas=sigmas,
-            noiser=noiser,
-            stepper=self._stepper,
-            width=stage_1_output_shape.width,
-            height=stage_1_output_shape.height,
-            frames=num_frames,
-            fps=frame_rate,
-            video=ModalitySpec(context=v_context_p, conditionings=stage_1_conditionings),
-            audio=ModalitySpec(context=a_context_p),
-            streaming_prefetch_count=streaming_prefetch_count,
-            max_batch_size=max_batch_size,
-        )
-        # Stage 2: Upscale and refine
-        # Only the first batch entry of the stage-1 video latent is upsampled
-        upscaled_video_latent = self.model_ledger.spatial_upsampler()(video_state.latent[:1])
-        distilled_sigmas = torch.tensor(STAGE_2_DISTILLED_SIGMA_VALUES, device=self.device, dtype=torch.float32)
-        stage_2_conditionings = combined_image_conditionings(
-            images=images,
-            height=height,
-            width=width,
-            video_encoder=video_encoder,
-            dtype=self.dtype,
-            device=self.device,
-        )
-        transformer = self.model_ledger.transformer(stage=2)
-        # Stage 2 starts from the stage-1 latents, with the first distilled sigma
-        # passed as noise_scale; SimpleDenoiser receives the positive contexts only.
-        video_state, audio_state = res2s_audio_video_denoising_loop(
-            transformer=transformer,
-            denoiser=SimpleDenoiser(v_context=v_context_p, a_context=a_context_p),
-            sigmas=distilled_sigmas,
-            noiser=noiser,
-            stepper=self._stepper,
-            width=width,
-            height=height,
-            frames=num_frames,
-            fps=frame_rate,
-            video=ModalitySpec(
-                context=v_context_p,
-                conditionings=stage_2_conditionings,
-                noise_scale=distilled_sigmas[0].item(),
-                initial_latent=upscaled_video_latent,
-            ),
-            audio=ModalitySpec(
-                context=a_context_p,
-                noise_scale=distilled_sigmas[0].item(),
-                initial_latent=audio_state.latent,
-            ),
-            streaming_prefetch_count=streaming_prefetch_count,
-            max_batch_size=max_batch_size,
-        )
-        # Decode
-        # NOTE(review): the ledger's vocoder() is never called in this method;
-        # decoded_audio is whatever the audio decoder returns — confirm that
-        # downstream encoding does not expect vocoded audio.
-        video_decoder = self.model_ledger.video_decoder()
-        audio_decoder = self.model_ledger.audio_decoder()
-        decoded_video = video_decoder(video_state.latent, tiling_config, generator)
-        decoded_audio = audio_decoder(audio_state.latent)
-        return decoded_video, decoded_audio
-# =============================================================================
-# Model Download
-# =============================================================================
-# Runs at import time: fetch weights, build the pipeline, then build every model
-# once so later calls only return the instances already held by the ledger.
-print("=" * 80)
-print("Downloading LTX-2.3 models...")
-print("=" * 80)
-# Three files from LTX_MODEL_REPO plus a full snapshot of GEMMA_REPO
-checkpoint_path = hf_hub_download(repo_id=LTX_MODEL_REPO, filename="ltx-2.3-22b-dev.safetensors")
-spatial_upsampler_path = hf_hub_download(repo_id=LTX_MODEL_REPO, filename="ltx-2.3-spatial-upscaler-x2-1.1.safetensors")
-distilled_lora_path = hf_hub_download(repo_id=LTX_MODEL_REPO, filename="ltx-2.3-22b-distilled-lora-384.safetensors")
-gemma_root = snapshot_download(repo_id=GEMMA_REPO)
-print(f"Checkpoint: {checkpoint_path}")
-print(f"Spatial upsampler: {spatial_upsampler_path}")
-print(f"Distilled LoRA: {distilled_lora_path}")
-print(f"Gemma root: {gemma_root}")
-print("=" * 80)
-print("All models downloaded!")
-print("=" * 80)
-# =============================================================================
-# Pipeline Initialization
-# =============================================================================
-print("Initializing TI2VidTwoStagesHQPipelineWithLedger...")
-# NOTE(review): the quantization policy is passed through to HQModelLedger, which
-# stores it as _quantization but never reads it — verify the fp8 cast is applied.
-pipeline = TI2VidTwoStagesHQPipelineWithLedger(
-    checkpoint_path=checkpoint_path,
-    distilled_lora_path=distilled_lora_path,
-    distilled_lora_strength_stage_1=0.25,
-    distilled_lora_strength_stage_2=0.50,
-    spatial_upsampler_path=spatial_upsampler_path,
-    gemma_root=gemma_root,
-    loras=(),
-    quantization=QuantizationPolicy.fp8_cast(),
-    torch_compile=False,
-)
-print("Pipeline initialized successfully!")
-print("=" * 80)
-# =============================================================================
-# ZeroGPU Tensor Preloading - model_ledger Pattern
-# =============================================================================
-print("Preloading all models for ZeroGPU tensor packing...")
-print("This may take a few minutes...")
-# Access model ledger
-ledger = pipeline.model_ledger
-# Preload all models - this mimics the official Space's pattern
-# Each accessor call builds the model and stores it on the ledger; the explicit
-# assignment that follows writes the same object back to the same attribute.
-print("  Loading transformer (stage 1)...")
-_transformer_s1 = ledger.transformer(stage=1)
-ledger._transformer_stage1 = _transformer_s1
-print("  Loading transformer (stage 2)...")
-_transformer_s2 = ledger.transformer(stage=2)
-ledger._transformer_stage2 = _transformer_s2
-print("  Loading video encoder...")
-_ve = ledger.video_encoder()
-ledger._video_encoder = _ve
-print("  Loading video decoder...")
-_vd = ledger.video_decoder()
-ledger._video_decoder = _vd
-print("  Loading audio decoder...")
-_ad = ledger.audio_decoder()
-ledger._audio_decoder = _ad
-print("  Loading vocoder...")
-_voc = ledger.vocoder()
-ledger._vocoder = _voc
-print("  Loading spatial upsampler...")
-_su = ledger.spatial_upsampler()
-ledger._spatial_upsampler = _su
-print("  Loading text encoder (Gemma)...")
-_te = ledger.text_encoder()
-ledger._text_encoder = _te
-print("  Loading embeddings processor...")
-_ep = ledger.embeddings_processor()
-ledger._embeddings_processor = _ep
-# Replace methods with lambdas to prevent garbage collection
-# This is the CRITICAL step that makes ZeroGPU tensor packing work
-# The replacements are instance attributes that shadow the class methods and
-# return the preloaded objects without any build path.
-def ledger_transformer(stage=1):
-    return ledger._transformer_stage1 if stage == 1 else ledger._transformer_stage2
-ledger.transformer = ledger_transformer
-ledger.video_encoder = lambda: ledger._video_encoder
-ledger.video_decoder = lambda: ledger._video_decoder
-ledger.audio_decoder = lambda: ledger._audio_decoder
-ledger.vocoder = lambda: ledger._vocoder
-ledger.spatial_upsampler = lambda: ledger._spatial_upsampler
-ledger.text_encoder = lambda: ledger._text_encoder
-ledger.embeddings_processor = lambda: ledger._embeddings_processor
-print("All models preloaded for ZeroGPU tensor packing!")
-print("=" * 80)
-# =============================================================================
-# Helper Functions
-# =============================================================================
-def log_memory(tag: str):
-    """Print allocated, peak, free and total CUDA memory in GB, labelled with *tag*.
-
-    Does nothing when CUDA is unavailable.
-    """
-    if not torch.cuda.is_available():
-        return
-    gib = 1024**3
-    allocated = torch.cuda.memory_allocated() / gib
-    peak = torch.cuda.max_memory_allocated() / gib
-    free, total = torch.cuda.mem_get_info()
-    print(f"[VRAM {tag}] allocated={allocated:.2f}GB peak={peak:.2f}GB free={free / gib:.2f}GB total={total / gib:.2f}GB")
-def calculate_frames(duration: float, frame_rate: float = DEFAULT_FRAME_RATE) -> int:
-    """Convert a duration in seconds into a frame count of the form 8*k + 1.
-
-    The truncated frame count is raised to at least MIN_FRAMES, snapped to the
-    nearest 8*k + 1 using round(), and capped at MAX_FRAMES.
-    """
-    raw_frames = max(int(duration * frame_rate), MIN_FRAMES)
-    blocks = round((raw_frames - 1) / 8)
-    return min(blocks * 8 + 1, MAX_FRAMES)
-def validate_resolution(height: int, width: int) -> tuple[int, int]:
-    """Snap each dimension to the nearest multiple of STEP, then clamp it to [MIN_DIM, MAX_DIM].
-
-    Returns the adjusted (height, width).
-    """
-    def snap(value):
-        stepped = round(value / STEP) * STEP
-        return max(MIN_DIM, min(stepped, MAX_DIM))
-    return snap(height), snap(width)
-def detect_aspect_ratio(image) -> str:
-    """Return the closest of "16:9", "9:16" or "1:1" for *image*.
-
-    Accepts objects exposing ``shape`` as (height, width, ...) or ``size`` as
-    (width, height). Returns "16:9" when *image* is None, has neither attribute,
-    or has a zero dimension.
-    """
-    if image is None:
-        return "16:9"
-    # Check ``shape`` first: array types also expose an integer ``size``,
-    # which cannot be unpacked into (width, height).
-    if hasattr(image, "shape"):
-        h, w = image.shape[:2]
-    elif hasattr(image, "size"):
-        w, h = image.size
-    else:
-        return "16:9"
-    # Avoid dividing by zero on degenerate images
-    if not w or not h:
-        return "16:9"
-    ratio = w / h
-    candidates = {"16:9": 16/9, "9:16": 9/16, "1:1": 1.0}
-    return min(candidates, key=lambda k: abs(ratio - candidates[k]))
-# Output size in pixels for each aspect-ratio label returned by detect_aspect_ratio
-RESOLUTIONS = {
-    "16:9": {"width": 1280, "height": 704},
-    "9:16": {"width": 704, "height": 1280},
-    "1:1": {"width": 960, "height": 960},
-}
-def get_duration(
-    prompt: str,
-    negative_prompt: str,
-    input_image,
-    duration: float,
-    seed: int,
-    randomize_seed: bool,
-    height: int,
-    width: int,
-    enhance_prompt: bool,
-    video_cfg_scale: float,
-    video_stg_scale: float,
-    video_rescale_scale: float,
-    video_a2v_scale: float,
-    audio_cfg_scale: float,
-    audio_stg_scale: float,
-    audio_rescale_scale: float,
-    audio_v2a_scale: float,
-    progress,
-) -> int:
-    """Estimate the GPU budget for one generation request.
-
-    Takes the same arguments as generate_video but reads only *duration*,
-    *height* and *width*. Starts at 60, adds a surcharge for each threshold
-    exceeded, and caps the result at 90 (presumably seconds — confirm against
-    the spaces.GPU documentation).
-    """
-    surcharges = (
-        (duration > 4, 15),
-        (duration > 6, 15),
-        (height > 700 or width > 1000, 15),
-        (int(duration * DEFAULT_FRAME_RATE) > 81, 10),
-    )
-    estimate = 60 + sum(extra for applies, extra in surcharges if applies)
-    return min(estimate, 90)
-@spaces.GPU(duration=get_duration)
-@torch.inference_mode()
-def generate_video(
-    prompt: str,
-    negative_prompt: str,
-    input_image,
-    duration: float,
-    seed: int,
-    randomize_seed: bool,
-    height: int,
-    width: int,
-    enhance_prompt: bool,
-    video_cfg_scale: float,
-    video_stg_scale: float,
-    video_rescale_scale: float,
-    video_a2v_scale: float,
-    audio_cfg_scale: float,
-    audio_stg_scale: float,
-    audio_rescale_scale: float,
-    audio_v2a_scale: float,
-    progress=gr.Progress(track_tqdm=True),
-):
-    """Generate a video, optionally conditioned on *input_image*, and encode it to MP4.
-
-    Returns (output_path, seed_used) on success and (None, seed) on failure;
-    failures are printed with a traceback rather than raised.
-    """
-    # Bind before the try block so the except handler can always return a seed,
-    # even when the failure happens before the seed has been resolved.
-    current_seed = seed
-    try:
-        torch.cuda.reset_peak_memory_stats()
-        log_memory("start")
-        current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
-        print(f"Using seed: {current_seed}")
-        height, width = validate_resolution(int(height), int(width))
-        print(f"Resolution: {width}x{height}")
-        num_frames = calculate_frames(duration, DEFAULT_FRAME_RATE)
-        print(f"Frames: {num_frames} ({duration}s @ {DEFAULT_FRAME_RATE}fps)")
-        images = []
-        if input_image is not None:
-            # The conditioning input takes a file path, so persist the image first
-            output_dir = Path("outputs")
-            output_dir.mkdir(exist_ok=True)
-            temp_image_path = output_dir / f"temp_input_{current_seed}.jpg"
-            if hasattr(input_image, "save"):
-                input_image.save(temp_image_path)
-            else:
-                import shutil
-                shutil.copy(input_image, temp_image_path)
-            images = [ImageConditioningInput(
-                path=str(temp_image_path),
-                frame_idx=0,
-                strength=1.0
-            )]
-        tiling_config = TilingConfig.default()
-        video_chunks_number = get_video_chunks_number(num_frames, tiling_config)
-        video_guider_params = MultiModalGuiderParams(
-            cfg_scale=video_cfg_scale,
-            stg_scale=video_stg_scale,
-            rescale_scale=video_rescale_scale,
-            modality_scale=video_a2v_scale,
-            skip_step=0,
-            stg_blocks=[],
-        )
-        audio_guider_params = MultiModalGuiderParams(
-            cfg_scale=audio_cfg_scale,
-            stg_scale=audio_stg_scale,
-            rescale_scale=audio_rescale_scale,
-            modality_scale=audio_v2a_scale,
-            skip_step=0,
-            stg_blocks=[],
-        )
-        log_memory("before pipeline call")
-        video, audio = pipeline(
-            prompt=prompt,
-            negative_prompt=negative_prompt,
-            seed=current_seed,
-            height=height,
-            width=width,
-            num_frames=num_frames,
-            frame_rate=DEFAULT_FRAME_RATE,
-            num_inference_steps=LTX_2_3_HQ_PARAMS.num_inference_steps,
-            video_guider_params=video_guider_params,
-            audio_guider_params=audio_guider_params,
-            images=images,
-            tiling_config=tiling_config,
-            enhance_prompt=enhance_prompt,
-        )
-        log_memory("after pipeline call")
-        # Create the output file atomically instead of using the deprecated,
-        # race-prone tempfile.mktemp; delete=False keeps it for the caller.
-        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp_file:
-            output_path = tmp_file.name
-        encode_video(
-            video=video,
-            fps=DEFAULT_FRAME_RATE,
-            audio=audio,
-            output_path=output_path,
-            video_chunks_number=video_chunks_number,
-        )
-        log_memory("after encode_video")
-        return str(output_path), current_seed
-    except Exception as e:
-        # Top-level UI boundary: report the failure and return an empty result
-        import traceback
-        log_memory("on error")
-        print(f"Error: {str(e)}\n{traceback.format_exc()}")
-        return None, current_seed
-# =============================================================================
-# Gradio UI
-# =============================================================================
-# Custom CSS passed to launch() at the bottom of the file
-css = """
-.fillable {max-width: 1200px !important}
-.progress-text {color: white}
-"""
-# Layout: inputs on the left, generated video on the right, and a collapsed
-# "Advanced Settings" accordion with resolution, seed and guidance controls.
-with gr.Blocks(title="LTX-2.3 Two-Stage HQ Video Generation") as demo:
-    gr.Markdown("# LTX-2.3 Two-Stage HQ Video Generation")
-    gr.Markdown(
-        "High-quality text/image-to-video generation using the dev model + distilled LoRA. "
-        "[[Model]](https://huggingface.co/Lightricks/LTX-2.3) "
-        "[[GitHub]](https://github.com/Lightricks/LTX-2)"
-    )
-    with gr.Row():
-        with gr.Column():
-            input_image = gr.Image(
-                label="Input Image (Optional - for image-to-video)",
-                type="pil",
-                sources=["upload", "webcam", "clipboard"]
-            )
-            prompt = gr.Textbox(
-                label="Prompt",
-                info="Describe the video you want to generate",
-                value=DEFAULT_PROMPT,
-                lines=3,
-                placeholder="Enter your prompt here..."
-            )
-            negative_prompt = gr.Textbox(
-                label="Negative Prompt",
-                info="What to avoid in the generated video",
-                value=DEFAULT_NEGATIVE_PROMPT,
-                lines=2,
-            )
-            duration = gr.Slider(
-                label="Duration (seconds)",
-                minimum=0.5,
-                maximum=8.0,
-                value=2.0,
-                step=0.1,
-            )
-            enhance_prompt = gr.Checkbox(
-                label="Enhance Prompt",
-                value=False,
-            )
-            generate_btn = gr.Button("Generate Video", variant="primary", size="lg")
-        with gr.Column():
-            output_video = gr.Video(
-                label="Generated Video",
-                autoplay=True,
-                interactive=False
-            )
-    with gr.Accordion("Advanced Settings", open=False):
-        with gr.Row():
-            width = gr.Number(label="Width", value=1280, precision=0)
-            height = gr.Number(label="Height", value=704, precision=0)
-        with gr.Row():
-            seed = gr.Number(label="Seed", value=42, precision=0, minimum=0, maximum=MAX_SEED)
-            randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
-        gr.Markdown("### Video Guidance Parameters")
-        with gr.Row():
-            video_cfg_scale = gr.Slider(
-                label="Video CFG Scale", minimum=1.0, maximum=10.0,
-                value=LTX_2_3_HQ_PARAMS.video_guider_params.cfg_scale, step=0.1
-            )
-            video_stg_scale = gr.Slider(
-                label="Video STG Scale", minimum=0.0, maximum=2.0, value=0.0, step=0.1
-            )
-        with gr.Row():
-            video_rescale_scale = gr.Slider(
-                label="Video Rescale", minimum=0.0, maximum=2.0, value=0.45, step=0.1
-            )
-            video_a2v_scale = gr.Slider(
-                label="A2V Scale", minimum=0.0, maximum=5.0, value=3.0, step=0.1
-            )
-        gr.Markdown("### Audio Guidance Parameters")
-        with gr.Row():
-            audio_cfg_scale = gr.Slider(
-                label="Audio CFG Scale", minimum=1.0, maximum=15.0,
-                value=LTX_2_3_HQ_PARAMS.audio_guider_params.cfg_scale, step=0.1
-            )
-            audio_stg_scale = gr.Slider(
-                label="Audio STG Scale", minimum=0.0, maximum=2.0, value=0.0, step=0.1
-            )
-        with gr.Row():
-            audio_rescale_scale = gr.Slider(
-                label="Audio Rescale", minimum=0.0, maximum=2.0, value=1.0, step=0.1
-            )
-            audio_v2a_scale = gr.Slider(
-                label="V2A Scale", minimum=0.0, maximum=5.0, value=3.0, step=0.1
-            )
-    # Sets width/height to the preset matching the uploaded image's aspect ratio.
-    # Arguments arrive as (image, height, width) but updates are returned as
-    # (width, height) to match `outputs=[width, height]` below; current_h and
-    # current_w are not read.
-    def on_image_upload(image, current_h, current_w):
-        if image is None:
-            return gr.update(), gr.update()
-        aspect = detect_aspect_ratio(image)
-        if aspect in RESOLUTIONS:
-            return (
-                gr.update(value=RESOLUTIONS[aspect]["width"]),
-                gr.update(value=RESOLUTIONS[aspect]["height"])
-            )
-        return gr.update(), gr.update()
-    input_image.change(
-        fn=on_image_upload,
-        inputs=[input_image, height, width],
-        outputs=[width, height],
-    )
-    # Input order follows generate_video's positional parameters; `progress` is
-    # not listed here (presumably supplied by Gradio — confirm).
-    generate_btn.click(
-        fn=generate_video,
-        inputs=[
-            prompt, negative_prompt, input_image, duration,
-            seed, randomize_seed, height, width, enhance_prompt,
-            video_cfg_scale, video_stg_scale, video_rescale_scale, video_a2v_scale,
-            audio_cfg_scale, audio_stg_scale, audio_rescale_scale, audio_v2a_scale,
-        ],
-        outputs=[output_video, seed],
-    )
-# NOTE(review): theme and css are passed to launch() rather than gr.Blocks();
-# confirm the installed Gradio version accepts them there.
-if __name__ == "__main__":
-    demo.queue().launch(
-        theme=gr.themes.Citrus(),
-        css=css,
-        mcp_server=True,
-    )