TestingwithNeg

Sleeping

App Files Files Community

dagloop5 committed on Apr 18

Commit

bfed689

verified ·

1 Parent(s): 287d66d

Create app.py

Browse files

Files changed (1) hide show

app.py +611 -0

app.py ADDED Viewed

	@@ -0,0 +1,611 @@

# =============================================================================
# Installation and Setup
# =============================================================================
import os
import shutil
import subprocess
import sys

# Both switches are set before torch is imported further down the file.
os.environ["TORCH_COMPILE_DISABLE"] = "1"
os.environ["TORCHDYNAMO_DISABLE"] = "1"

# Clone LTX-2 repo at the commit with ModelLedger-based HQ pipeline
LTX_REPO_URL = "https://github.com/Lightricks/LTX-2.git"
LTX_REPO_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "LTX-2")
LTX_COMMIT_SHA = "ae855f8538843825f9015a419cf4ba5edaf5eec2"


def _clone_ltx_repo() -> None:
    """Create LTX_REPO_DIR and check out LTX_COMMIT_SHA from LTX_REPO_URL.

    The clone is a shallow fetch of the single pinned commit. If any git step
    fails, the partially populated directory is removed before the error is
    re-raised; otherwise the existence check below would skip the clone on
    every later start and the imports from the repo would keep failing.

    Raises:
        subprocess.CalledProcessError: if a git command exits non-zero.
        FileExistsError: if LTX_REPO_DIR already exists.
    """
    os.makedirs(LTX_REPO_DIR)
    git_steps = (
        ["init", LTX_REPO_DIR],
        ["remote", "add", "origin", LTX_REPO_URL],
        ["fetch", "--depth", "1", "origin", LTX_COMMIT_SHA],
        ["checkout", LTX_COMMIT_SHA],
    )
    try:
        for git_args in git_steps:
            subprocess.run(["git", *git_args], cwd=LTX_REPO_DIR, check=True)
    except BaseException:
        # Also covers KeyboardInterrupt: never leave a half-initialised repo.
        shutil.rmtree(LTX_REPO_DIR, ignore_errors=True)
        raise


if not os.path.exists(LTX_REPO_DIR):
    print(f"Cloning {LTX_REPO_URL} at commit {LTX_COMMIT_SHA}...")
    _clone_ltx_repo()

# Make the two in-repo packages importable; ltx-core ends up first on sys.path.
sys.path.insert(0, os.path.join(LTX_REPO_DIR, "packages", "ltx-pipelines", "src"))
sys.path.insert(0, os.path.join(LTX_REPO_DIR, "packages", "ltx-core", "src"))
+# =============================================================================
+# Imports
+# =============================================================================
+import logging
+import random
+import tempfile
+from pathlib import Path
+import torch
+torch._dynamo.config.suppress_errors = True
+torch._dynamo.config.disable = True
+import gradio as gr
+import spaces
+import numpy as np
+from huggingface_hub import hf_hub_download, snapshot_download
+# Core LTX imports from the ModelLedger-based HQ pipeline
+from ltx_core.model.video_vae import TilingConfig, get_video_chunks_number
+from ltx_core.quantization import QuantizationPolicy
+from ltx_core.loader import LoraPathStrengthAndSDOps, LTXV_LORA_COMFY_RENAMING_MAP
+from ltx_core.components.guiders import MultiModalGuiderParams
+# Import the HQ pipeline with ModelLedger
+from ltx_pipelines.ti2vid_two_stages_hq import TI2VidTwoStagesHQPipeline
+# Import constants
+from ltx_pipelines.utils.constants import LTX_2_3_HQ_PARAMS
+logging.getLogger().setLevel(logging.INFO)
# =============================================================================
# Constants
# =============================================================================
# Hugging Face repository ids.
LTX_MODEL_REPO = "Lightricks/LTX-2.3"
GEMMA_REPO = "Lightricks/gemma-3-12b-it-qat-q4_0-unquantized"

DEFAULT_FRAME_RATE = 24.0

# Limits for the requested resolution (pixels) and clip length (frames).
MIN_DIM = 256
MAX_DIM = 1280
STEP = 64
MIN_FRAMES = 9
MAX_FRAMES = 257

# Largest value a signed 32-bit integer can hold.
MAX_SEED = np.iinfo(np.int32).max

# The default prompts are comma-separated clause lists.
DEFAULT_PROMPT = ", ".join(
    (
        "A majestic eagle soaring over mountain peaks at sunset",
        "wings spread wide against the orange sky",
        "feathers catching the light",
        "wind currents visible in the motion blur",
        "cinematic slow motion",
        "4K quality",
    )
)
DEFAULT_NEGATIVE_PROMPT = ", ".join(
    (
        "worst quality",
        "inconsistent motion",
        "blurry",
        "jittery",
        "distorted",
        "deformed",
        "artifacts",
        "text",
        "watermark",
        "logo",
        "frame",
        "border",
        "low resolution",
        "pixelated",
        "unnatural",
        "fake",
        "CGI",
        "cartoon",
    )
)
# =============================================================================
# Model Download
# =============================================================================
print("=" * 80)
print("Downloading LTX-2.3 models...")
print("=" * 80)


def _fetch_ltx_file(filename: str) -> str:
    """Download ``filename`` from LTX_MODEL_REPO and return what hf_hub_download returns."""
    return hf_hub_download(repo_id=LTX_MODEL_REPO, filename=filename)


checkpoint_path = _fetch_ltx_file("ltx-2.3-22b-dev.safetensors")
spatial_upsampler_path = _fetch_ltx_file("ltx-2.3-spatial-upscaler-x2-1.1.safetensors")
distilled_lora_path = _fetch_ltx_file("ltx-2.3-22b-distilled-lora-384.safetensors")
gemma_root = snapshot_download(repo_id=GEMMA_REPO)

# Report where each artifact ended up.
for _label, _location in (
    ("Dev checkpoint", checkpoint_path),
    ("Spatial upsampler", spatial_upsampler_path),
    ("Distilled LoRA", distilled_lora_path),
    ("Gemma root", gemma_root),
):
    print(f"{_label}: {_location}")
print("=" * 80)
# =============================================================================
# Pipeline Initialization
# =============================================================================
print("Initializing TI2VidTwoStagesHQPipeline with ModelLedger...")

# LoRA configuration for the distilled model. The strength given here is 1.0;
# the per-stage strengths are passed to the pipeline separately below.
_distilled_lora_entry = LoraPathStrengthAndSDOps(
    path=distilled_lora_path,
    strength=1.0,
    sd_ops=LTXV_LORA_COMFY_RENAMING_MAP,
)
distilled_lora = [_distilled_lora_entry]

# Keyword arguments for the ModelLedger-based HQ pipeline.
_pipeline_options = {
    "checkpoint_path": checkpoint_path,
    "distilled_lora": distilled_lora,
    "distilled_lora_strength_stage_1": 0.25,
    "distilled_lora_strength_stage_2": 0.50,
    "spatial_upsampler_path": spatial_upsampler_path,
    "gemma_root": gemma_root,
    "loras": (),  # no additional custom LoRAs
    "quantization": QuantizationPolicy.fp8_cast(),
}
pipeline = TI2VidTwoStagesHQPipeline(**_pipeline_options)

print("Pipeline initialized successfully!")
print("=" * 80)
# =============================================================================
# ZeroGPU Tensor Preloading - ModelLedger Pattern
# =============================================================================
# This pipeline has TWO model ledgers: stage_1_model_ledger and stage_2_model_ledger
# We must preload both for ZeroGPU tensor packing to work

# Ledger factory methods that are called once and then pinned, in load order.
_LEDGER_COMPONENTS = (
    "transformer",
    "video_encoder",
    "video_decoder",
    "audio_decoder",
    "vocoder",
    "spatial_upsampler",
    "text_encoder",
    "embeddings_processor",
)


def _constant_factory(model):
    """Return a zero-argument callable that always hands back ``model``."""
    return lambda: model


def _preload_ledger(ledger, component_names=_LEDGER_COMPONENTS) -> dict:
    """Instantiate every named component of ``ledger`` once and pin it.

    Each name in ``component_names`` is called as a zero-argument factory on
    the ledger; the factory attribute is then replaced on the ledger instance
    by a callable returning that same object, so later calls reuse it.

    Args:
        ledger: object exposing the named factory methods.
        component_names: factory names, processed in the given order.

    Returns:
        Mapping of component name to the instantiated object.
    """
    loaded = {}
    for component in component_names:
        model = getattr(ledger, component)()
        setattr(ledger, component, _constant_factory(model))
        loaded[component] = model
    return loaded


print("Preloading all models for ZeroGPU tensor packing...")
print("This may take a few minutes...")

# Module-level mapping keeps a reference to every preloaded model.
_PRELOADED_MODELS = {}

# ===== Stage 1 Model Ledger =====
print("  Loading Stage 1 models...")
_PRELOADED_MODELS["stage_1"] = _preload_ledger(pipeline.stage_1_model_ledger)
print("    Stage 1 models loaded")

# ===== Stage 2 Model Ledger =====
# NOTE(review): the text encoder and embeddings processor are instantiated again
# for stage 2. Earlier comments suggested they "can reuse from stage 1" -
# confirm whether sharing the stage 1 instances is safe before changing this.
print("  Loading Stage 2 models...")
_PRELOADED_MODELS["stage_2"] = _preload_ledger(pipeline.stage_2_model_ledger)
print("    Stage 2 models loaded")

print("All models preloaded for ZeroGPU tensor packing!")
print("=" * 80)
# =============================================================================
# Helper Functions
# =============================================================================
def log_memory(tag: str):
    """Print current CUDA memory figures in GB, labelled with ``tag``.

    Prints allocated, peak allocated, free and total memory. Does nothing
    when CUDA is not available.
    """
    if not torch.cuda.is_available():
        return
    gib = 1024**3
    allocated_gb = torch.cuda.memory_allocated() / gib
    peak_gb = torch.cuda.max_memory_allocated() / gib
    free_bytes, total_bytes = torch.cuda.mem_get_info()
    print(f"[VRAM {tag}] allocated={allocated_gb:.2f}GB peak={peak_gb:.2f}GB free={free_bytes / gib:.2f}GB total={total_bytes / gib:.2f}GB")
def calculate_frames(duration: float, frame_rate: float = DEFAULT_FRAME_RATE) -> int:
    """Convert a duration in seconds to a frame count of the form 8*k + 1.

    The truncated product ``duration * frame_rate`` is raised to at least
    MIN_FRAMES, snapped to the nearest ``8*k + 1`` using Python's ``round``,
    and finally capped at MAX_FRAMES.
    """
    requested = max(int(duration * frame_rate), MIN_FRAMES)
    snapped = 8 * round((requested - 1) / 8) + 1
    return min(snapped, MAX_FRAMES)
def validate_resolution(height: int, width: int) -> tuple[int, int]:
    """Snap both dimensions to a multiple of STEP and clamp to [MIN_DIM, MAX_DIM].

    Returns:
        The adjusted ``(height, width)`` pair.
    """

    def _snap(value):
        # Nearest multiple of STEP first, then clamp into the allowed range.
        nearest = round(value / STEP) * STEP
        return max(MIN_DIM, min(nearest, MAX_DIM))

    return _snap(height), _snap(width)
def detect_aspect_ratio(image) -> str:
    """Return the preset aspect-ratio label closest to the image's width/height.

    Args:
        image: ``None``, an object with an array-style ``shape`` whose first two
            entries are (height, width), or an object whose ``size`` is a
            ``(width, height)`` pair.

    Returns:
        One of ``"16:9"``, ``"9:16"`` or ``"1:1"``. ``"16:9"`` is the fallback
        when the image is ``None``, its dimensions cannot be read, or either
        dimension is not positive.
    """
    fallback = "16:9"
    if image is None:
        return fallback

    # ``shape`` is checked first: NumPy arrays also expose ``size``, but there it
    # is an element count (an int), which cannot be unpacked into (w, h).
    if hasattr(image, "shape"):
        dims = tuple(image.shape[:2])
        if len(dims) < 2:
            return fallback
        height, width = dims
    elif hasattr(image, "size"):
        width, height = image.size
    else:
        return fallback

    # Avoid a division by zero (or a meaningless ratio) on degenerate inputs.
    if width <= 0 or height <= 0:
        return fallback

    actual = width / height
    presets = (("16:9", 16 / 9), ("9:16", 9 / 16), ("1:1", 1.0))
    best_label, _ = min(presets, key=lambda preset: abs(actual - preset[1]))
    return best_label
# Output size in pixels for each aspect-ratio label.
RESOLUTIONS = {
    label: {"width": preset_width, "height": preset_height}
    for label, (preset_width, preset_height) in (
        ("16:9", (1280, 704)),
        ("9:16", (704, 1280)),
        ("1:1", (960, 960)),
    )
}
def get_duration(
    prompt: str,
    negative_prompt: str,
    input_image,
    duration: float,
    seed: int,
    randomize_seed: bool,
    height: int,
    width: int,
    enhance_prompt: bool,
    video_cfg_scale: float,
    video_stg_scale: float,
    video_rescale_scale: float,
    video_a2v_scale: float,
    audio_cfg_scale: float,
    audio_stg_scale: float,
    audio_rescale_scale: float,
    audio_v2a_scale: float,
    progress,
) -> int:
    """Return a time budget for one generation request, capped at 90.

    The signature mirrors ``generate_video``; only ``duration``, ``height``
    and ``width`` influence the result. Starting from 60, fixed surcharges
    are added for longer clips, larger resolutions and higher frame counts.
    """
    surcharges = (
        (duration > 4, 15),
        (duration > 6, 15),
        (height > 700 or width > 1000, 15),
        (int(duration * DEFAULT_FRAME_RATE) > 81, 10),
    )
    estimate = 60 + sum(extra for applies, extra in surcharges if applies)
    return min(estimate, 90)
def _write_conditioning_image(input_image) -> Path:
    """Store the conditioning image as a uniquely named JPEG under ``outputs/``.

    Objects with a ``save`` method are written with it; anything else is
    treated as a path and copied. The file name is unique per call, so
    requests that share a seed cannot overwrite each other's input.
    """
    import shutil

    output_dir = Path("outputs")
    output_dir.mkdir(exist_ok=True)
    handle, raw_path = tempfile.mkstemp(prefix="temp_input_", suffix=".jpg", dir=output_dir)
    os.close(handle)
    image_path = Path(raw_path)

    if hasattr(input_image, "save"):
        # JPEG cannot store alpha or palette images (e.g. RGBA from a PNG or the
        # clipboard); saving those raises, so convert them to RGB first.
        if getattr(input_image, "mode", "RGB") not in ("RGB", "L", "CMYK"):
            input_image = input_image.convert("RGB")
        input_image.save(image_path)
    else:
        shutil.copy(input_image, image_path)
    return image_path


@spaces.GPU(duration=get_duration)
@torch.inference_mode()
def generate_video(
    prompt: str,
    negative_prompt: str,
    input_image,
    duration: float,
    seed: int,
    randomize_seed: bool,
    height: int,
    width: int,
    enhance_prompt: bool,
    video_cfg_scale: float,
    video_stg_scale: float,
    video_rescale_scale: float,
    video_a2v_scale: float,
    audio_cfg_scale: float,
    audio_stg_scale: float,
    audio_rescale_scale: float,
    audio_v2a_scale: float,
    progress=gr.Progress(track_tqdm=True),
):
    """
    Generate high-quality video using the ModelLedger-based HQ pipeline.
    This pipeline uses ModelLedger (like DistilledPipeline) for ZeroGPU compatibility,
    while supporting CFG/negative prompts (like original TI2VidTwoStagesHQPipeline).

    Args:
        prompt / negative_prompt: text passed to the pipeline.
        input_image: optional conditioning image; ``None`` means text-to-video.
        duration: clip length in seconds, converted with ``calculate_frames``.
        seed / randomize_seed: when ``randomize_seed`` is true a random seed in
            ``[0, MAX_SEED]`` replaces ``seed``.
        height / width: requested size, adjusted by ``validate_resolution``.
        enhance_prompt: forwarded to the pipeline unchanged.
        video_* / audio_*: values placed into the two ``MultiModalGuiderParams``.
        progress: Gradio progress tracker.

    Returns:
        ``(path_to_mp4, seed_used)`` on success, ``(None, seed)`` when any step
        fails. Failures are printed with a traceback rather than raised.
    """
    import traceback

    # Bound before the try block so the error path can always return a seed.
    current_seed = seed
    temp_image_path = None
    output_path = None
    try:
        if torch.cuda.is_available():
            torch.cuda.reset_peak_memory_stats()
        log_memory("start")
        current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
        print(f"Using seed: {current_seed}")
        height, width = validate_resolution(int(height), int(width))
        print(f"Resolution: {width}x{height}")
        num_frames = calculate_frames(duration, DEFAULT_FRAME_RATE)
        print(f"Frames: {num_frames} ({duration}s @ {DEFAULT_FRAME_RATE}fps)")

        # Prepare image conditioning if provided
        images = []
        if input_image is not None:
            temp_image_path = _write_conditioning_image(input_image)
            # Use ImageConditioningInput format
            images = [(str(temp_image_path), 1.0)]  # (path, strength)

        tiling_config = TilingConfig.default()
        video_chunks_number = get_video_chunks_number(num_frames, tiling_config)

        # Configure MultiModalGuider parameters
        video_guider_params = MultiModalGuiderParams(
            cfg_scale=video_cfg_scale,
            stg_scale=video_stg_scale,
            rescale_scale=video_rescale_scale,
            modality_scale=video_a2v_scale,
            skip_step=0,
            stg_blocks=[],
        )
        audio_guider_params = MultiModalGuiderParams(
            cfg_scale=audio_cfg_scale,
            stg_scale=audio_stg_scale,
            rescale_scale=audio_rescale_scale,
            modality_scale=audio_v2a_scale,
            skip_step=0,
            stg_blocks=[],
        )
        log_memory("before pipeline call")

        # Call the pipeline
        video, audio = pipeline(
            prompt=prompt,
            negative_prompt=negative_prompt,
            seed=current_seed,
            height=height,
            width=width,
            num_frames=num_frames,
            frame_rate=DEFAULT_FRAME_RATE,
            num_inference_steps=LTX_2_3_HQ_PARAMS.num_inference_steps,
            video_guider_params=video_guider_params,
            audio_guider_params=audio_guider_params,
            images=images,
            tiling_config=tiling_config,
            enhance_prompt=enhance_prompt,
        )
        log_memory("after pipeline call")

        # Encode video with audio
        from ltx_pipelines.utils.media_io import encode_video

        # Reserve the output file atomically instead of using the deprecated,
        # race-prone tempfile.mktemp.
        with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as reserved:
            output_path = reserved.name
        encode_video(
            video=video,
            fps=DEFAULT_FRAME_RATE,
            audio=audio,
            output_path=output_path,
            video_chunks_number=video_chunks_number,
        )
        log_memory("after encode_video")
        return str(output_path), current_seed
    except Exception as e:
        log_memory("on error")
        print(f"Error: {str(e)}\n{traceback.format_exc()}")
        # Do not leave an empty or partial output file behind.
        if output_path is not None:
            Path(output_path).unlink(missing_ok=True)
        return None, current_seed
    finally:
        # The conditioning image is only needed while this call is running.
        if temp_image_path is not None:
            temp_image_path.unlink(missing_ok=True)
# =============================================================================
# Gradio UI
# =============================================================================
css = """
.fillable {max-width: 1200px !important}
.progress-text {color: white}
"""


def _guidance_slider(label, minimum, maximum, value):
    """Create one guidance slider; all of them share a 0.1 step."""
    return gr.Slider(label=label, minimum=minimum, maximum=maximum, value=value, step=0.1)


with gr.Blocks(title="LTX-2.3 Two-Stage HQ Video Generation") as demo:
    gr.Markdown("# LTX-2.3 Two-Stage HQ Video Generation")
    gr.Markdown(
        "High-quality text/image-to-video generation using ModelLedger. "
        "[[Model]](https://huggingface.co/Lightricks/LTX-2.3) "
        "[[GitHub]](https://github.com/Lightricks/LTX-2)"
    )

    with gr.Row():
        # Left column: request inputs.
        with gr.Column():
            input_image = gr.Image(
                label="Input Image (Optional - for image-to-video)",
                type="pil",
                sources=["upload", "webcam", "clipboard"],
            )
            prompt = gr.Textbox(
                label="Prompt",
                info="Describe the video you want to generate",
                value=DEFAULT_PROMPT,
                lines=3,
            )
            negative_prompt = gr.Textbox(
                label="Negative Prompt",
                info="What to avoid in the generated video",
                value=DEFAULT_NEGATIVE_PROMPT,
                lines=2,
            )
            duration = gr.Slider(
                label="Duration (seconds)",
                minimum=0.5,
                maximum=8.0,
                value=2.0,
                step=0.1,
            )
            enhance_prompt = gr.Checkbox(label="Enhance Prompt", value=False)
            generate_btn = gr.Button("Generate Video", variant="primary", size="lg")

        # Right column: result player.
        with gr.Column():
            output_video = gr.Video(label="Generated Video", autoplay=True, interactive=False)

    with gr.Accordion("Advanced Settings", open=False):
        with gr.Row():
            width = gr.Number(label="Width", value=1280, precision=0)
            height = gr.Number(label="Height", value=704, precision=0)
        with gr.Row():
            seed = gr.Number(label="Seed", value=42, precision=0, minimum=0, maximum=MAX_SEED)
            randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)

        gr.Markdown("### Video Guidance Parameters")
        gr.Markdown("Control how strongly the model follows the video prompt.")
        with gr.Row():
            video_cfg_scale = _guidance_slider(
                "Video CFG Scale", 1.0, 10.0, LTX_2_3_HQ_PARAMS.video_guider_params.cfg_scale
            )
            video_stg_scale = _guidance_slider("Video STG Scale", 0.0, 2.0, 0.0)
        with gr.Row():
            video_rescale_scale = _guidance_slider("Video Rescale", 0.0, 2.0, 0.45)
            video_a2v_scale = _guidance_slider("A2V Scale", 0.0, 5.0, 3.0)

        gr.Markdown("### Audio Guidance Parameters")
        gr.Markdown("Control audio generation quality and sync.")
        with gr.Row():
            audio_cfg_scale = _guidance_slider(
                "Audio CFG Scale", 1.0, 15.0, LTX_2_3_HQ_PARAMS.audio_guider_params.cfg_scale
            )
            audio_stg_scale = _guidance_slider("Audio STG Scale", 0.0, 2.0, 0.0)
        with gr.Row():
            audio_rescale_scale = _guidance_slider("Audio Rescale", 0.0, 2.0, 1.0)
            audio_v2a_scale = _guidance_slider("V2A Scale", 0.0, 5.0, 3.0)

    def on_image_upload(image, current_h, current_w):
        """Set width/height to the preset matching the uploaded image's aspect ratio.

        Returns updates for (width, height); both are no-op updates when there
        is no image or no preset for the detected ratio.
        """
        preset = None if image is None else RESOLUTIONS.get(detect_aspect_ratio(image))
        if preset is None:
            return gr.update(), gr.update()
        return gr.update(value=preset["width"]), gr.update(value=preset["height"])

    input_image.change(
        fn=on_image_upload,
        inputs=[input_image, height, width],
        outputs=[width, height],
    )

    # Order must match the positional parameters of generate_video.
    generation_inputs = [
        prompt,
        negative_prompt,
        input_image,
        duration,
        seed,
        randomize_seed,
        height,
        width,
        enhance_prompt,
        video_cfg_scale,
        video_stg_scale,
        video_rescale_scale,
        video_a2v_scale,
        audio_cfg_scale,
        audio_stg_scale,
        audio_rescale_scale,
        audio_v2a_scale,
    ]
    generate_btn.click(
        fn=generate_video,
        inputs=generation_inputs,
        outputs=[output_video, seed],
    )
# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    queued_app = demo.queue()
    queued_app.launch(theme=gr.themes.Citrus(), css=css, mcp_server=True)