Update app.py
app.py (CHANGED)
@@ -9,6 +9,9 @@ os.environ["TORCHDYNAMO_DISABLE"] = "1"
 # Install xformers for memory-efficient attention
 subprocess.run([sys.executable, "-m", "pip", "install", "xformers==0.0.32.post2", "--no-build-isolation"], check=False)
 
+# Install video preprocessing dependencies (pose/canny/depth extraction)
+subprocess.run([sys.executable, "-m", "pip", "install", "controlnet_aux", "imageio[ffmpeg]"], check=False)
+
 # Clone LTX-2 repo and install packages
 LTX_REPO_URL = "https://github.com/Lightricks/LTX-2.git"
 LTX_REPO_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "LTX-2")
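
Reviewer note: both installs run unconditionally on every boot, and check=False swallows failures. A minimal guard sketch (the helper name is ours, not in the commit) that skips pip when the module already imports:

import importlib.util
import subprocess
import sys

def ensure_installed(pip_spec: str, module_name: str) -> None:
    # Skip the pip call when the module already resolves (e.g. warm restarts).
    if importlib.util.find_spec(module_name) is None:
        subprocess.run(
            [sys.executable, "-m", "pip", "install", pip_spec],
            check=False,  # keep the Space booting even if the install fails
        )

ensure_installed("controlnet_aux", "controlnet_aux")
ensure_installed("imageio[ffmpeg]", "imageio")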
@@ -91,6 +94,152 @@ except Exception as e:
 logging.getLogger().setLevel(logging.INFO)
 
 
+# ─────────────────────────────────────────────────────────────────────────────
+# Video Preprocessing: Strip appearance, keep structure
+# ─────────────────────────────────────────────────────────────────────────────
+import imageio
+import cv2
+from PIL import Image
+
+# Lazy-loaded processors (heavy models, only init when needed)
+_pose_processor = None
+_depth_processor = None
+
+
+def _get_pose_processor():
+    global _pose_processor
+    if _pose_processor is None:
+        from controlnet_aux import DWposeDetector
+        _pose_processor = DWposeDetector.from_pretrained_default()
+        print("[Preprocess] DWPose processor loaded")
+    return _pose_processor
+
+
+def _get_depth_processor():
+    global _depth_processor
+    if _depth_processor is None:
+        from controlnet_aux import MidasDetector
+        _depth_processor = MidasDetector.from_pretrained("lllyasviel/Annotators")
+        print("[Preprocess] MiDaS depth processor loaded")
+    return _depth_processor
+
+
+def load_video_frames(video_path: str) -> list[np.ndarray]:
+    """Load video frames as a list of HWC uint8 numpy arrays."""
+    frames = []
+    with imageio.get_reader(video_path) as reader:
+        for frame in reader:
+            frames.append(frame)
+    return frames
+
+
+def write_video_mp4(frames_float_01: list[np.ndarray], fps: float, out_path: str) -> str:
+    """Write float [0, 1] frames to mp4."""
+    frames_uint8 = [(f * 255).astype(np.uint8) for f in frames_float_01]
+    with imageio.get_writer(out_path, fps=fps, macro_block_size=1) as writer:
+        for fr in frames_uint8:
+            writer.append_data(fr)
+    return out_path
+
+
+def extract_first_frame(video_path: str) -> str:
+    """Extract the first frame as a temp PNG file, return its path."""
+    frames = load_video_frames(video_path)
+    if not frames:
+        raise ValueError("No frames in video")
+    out_path = tempfile.mktemp(suffix=".png")
+    Image.fromarray(frames[0]).save(out_path)
+    return out_path
+
+
+def preprocess_video_pose(frames: list[np.ndarray], width: int, height: int) -> list[np.ndarray]:
+    """Extract DWPose skeletons from each frame. Returns float [0, 1] frames."""
+    processor = _get_pose_processor()
+    result = []
+    for frame in frames:
+        pil = Image.fromarray(frame.astype(np.uint8)).convert("RGB")
+        pose_img = processor(pil, include_body=True, include_hand=True, include_face=True)
+        if not isinstance(pose_img, Image.Image):
+            pose_img = Image.fromarray(pose_img.astype(np.uint8))
+        pose_img = pose_img.convert("RGB").resize((width, height), Image.BILINEAR)
+        result.append(np.array(pose_img).astype(np.float32) / 255.0)
+    return result
+
+
+def preprocess_video_canny(frames: list[np.ndarray], width: int, height: int,
+                           low_threshold: int = 50, high_threshold: int = 100) -> list[np.ndarray]:
+    """Extract Canny edges from each frame. Returns float [0, 1] frames."""
+    result = []
+    for frame in frames:
+        # Resize first
+        resized = cv2.resize(frame, (width, height), interpolation=cv2.INTER_AREA)
+        gray = cv2.cvtColor(resized, cv2.COLOR_RGB2GRAY)
+        edges = cv2.Canny(gray, low_threshold, high_threshold)
+        # Convert single-channel to 3-channel
+        edges_3ch = np.stack([edges, edges, edges], axis=-1)
+        result.append(edges_3ch.astype(np.float32) / 255.0)
+    return result
+
+
+def preprocess_video_depth(frames: list[np.ndarray], width: int, height: int) -> list[np.ndarray]:
+    """Extract MiDaS depth maps from each frame. Returns float [0, 1] frames."""
+    processor = _get_depth_processor()
+    detect_res = max(frames[0].shape[0], frames[0].shape[1])
+    image_res = max(width, height)
+    result = []
+    for frame in frames:
+        depth = processor(frame, detect_resolution=detect_res,
+                          image_resolution=image_res, output_type="np")
+        if depth.ndim == 2:
+            depth = np.stack([depth, depth, depth], axis=-1)
+        elif depth.shape[-1] == 1:
+            depth = np.repeat(depth, 3, axis=-1)
+        # Normalize the detector's uint8 output to float [0, 1] so it matches
+        # the other preprocessors and write_video_mp4's expectations.
+        result.append(depth.astype(np.float32) / 255.0)
+    return result
+
+
+def preprocess_conditioning_video(
+    video_path: str,
+    mode: str,
+    width: int,
+    height: int,
+    num_frames: int,
+    fps: float,
+) -> tuple[str, str]:
+    """
+    Preprocess a video for conditioning. Strips appearance, keeps structure.
+
+    Returns:
+        (conditioning_mp4_path, first_frame_png_path)
+    """
+    frames = load_video_frames(video_path)
+    if not frames:
+        raise ValueError("No frames decoded from video")
+
+    # Trim to num_frames
+    frames = frames[:num_frames]
+
+    # Save first frame (original appearance) for image conditioning
+    first_png = tempfile.mktemp(suffix=".png")
+    Image.fromarray(frames[0]).save(first_png)
+
+    # Process based on mode
+    if mode == "Pose (DWPose)":
+        processed = preprocess_video_pose(frames, width, height)
+    elif mode == "Canny Edge":
+        processed = preprocess_video_canny(frames, width, height)
+    elif mode == "Depth (MiDaS)":
+        processed = preprocess_video_depth(frames, width, height)
+    else:
+        # "Raw" mode - no preprocessing
+        processed = [f.astype(np.float32) / 255.0 for f in frames]
+
+    cond_mp4 = tempfile.mktemp(suffix=".mp4")
+    write_video_mp4(processed, fps=fps, out_path=cond_mp4)
+
+    return cond_mp4, first_png
+
+
 # ─────────────────────────────────────────────────────────────────────────────
 # Helper: read reference downscale factor from IC-LoRA metadata
 # ─────────────────────────────────────────────────────────────────────────────
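
As a reading aid, a minimal sketch of the round trip these helpers implement (the clip path, size, frame count, and fps here are hypothetical; the real values come from the Gradio UI):

cond_mp4, first_png = preprocess_conditioning_video(
    video_path="/tmp/reference.mp4",  # hypothetical input clip
    mode="Canny Edge",                # or "Pose (DWPose)" / "Depth (MiDaS)" / raw fallback
    width=640, height=352,            # half of a 1280x704 target (see generate_video below)
    num_frames=121,                   # trims longer clips; shorter ones pass through as-is
    fps=24.0,
)
# cond_mp4 holds structure only (edges); first_png keeps the original
# appearance and can serve as the image conditioning when none was uploaded.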
@@ -547,7 +696,7 @@ pipeline = LTX23UnifiedPipeline(
     distilled_checkpoint_path=checkpoint_path,
     spatial_upsampler_path=spatial_upsampler_path,
     gemma_root=gemma_root,
-
+    ic_loras=ic_loras,
     quantization=QuantizationPolicy.fp8_cast(),
 )
 
@@ -680,6 +829,31 @@ def on_highres_toggle(image, video, high_res):
 # ─────────────────────────────────────────────────────────────────────────────
 # Generation
 # ─────────────────────────────────────────────────────────────────────────────
+def _extract_audio_from_video(video_path: str) -> str | None:
+    """Extract audio from video as a temp WAV file. Returns None if no audio."""
+    out_path = tempfile.mktemp(suffix=".wav")
+    try:
+        # Check if video has an audio stream
+        probe = subprocess.run(
+            ["ffprobe", "-v", "error", "-select_streams", "a:0",
+             "-show_entries", "stream=codec_type", "-of", "default=nw=1:nk=1",
+             video_path],
+            capture_output=True, text=True,
+        )
+        if not probe.stdout.strip():
+            return None
+
+        # Extract audio
+        subprocess.run(
+            ["ffmpeg", "-y", "-v", "error", "-i", video_path,
+             "-vn", "-ac", "1", "-ar", "48000", "-c:a", "pcm_s16le", out_path],
+            check=True,
+        )
+        return out_path
+    except (subprocess.CalledProcessError, FileNotFoundError):
+        return None
+
+
 @spaces.GPU(duration=180)
 @torch.inference_mode()
 def generate_video(
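
A quick local check of the helper (file path hypothetical); it degrades to None both when the clip has no audio stream and when ffmpeg/ffprobe are missing from the image:

wav = _extract_audio_from_video("/tmp/reference.mp4")  # hypothetical path
if wav is None:
    print("no audio stream, or ffmpeg/ffprobe unavailable")
else:
    print("mono 48 kHz PCM written to", wav)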
@@ -689,6 +863,7 @@ def generate_video(
     prompt: str,
     duration: float,
     conditioning_strength: float,
+    video_preprocess: str,
     enhance_prompt: bool,
     seed: int,
     randomize_seed: bool,
@@ -708,7 +883,7 @@ def generate_video(
     if input_image is not None:
         mode_parts.append("Image")
     if input_video is not None:
-        mode_parts.append("Video")
+        mode_parts.append(f"Video({video_preprocess})")
     if input_audio is not None:
         mode_parts.append("Audio")
     if not mode_parts:
@@ -723,10 +898,40 @@ def generate_video(
     if input_image is not None:
         images = [ImageConditioningInput(path=str(input_image), frame_idx=0, strength=1.0)]
 
-    # Build video conditionings
+    # Build video conditionings: preprocess to strip appearance
     video_conditioning = None
     if input_video is not None:
-
+        video_path = str(input_video)
+
+        if video_preprocess != "Raw (no preprocessing)":
+            print(f"[Preprocess] Running {video_preprocess} on input video...")
+            cond_mp4, first_frame_png = preprocess_conditioning_video(
+                video_path=video_path,
+                mode=video_preprocess,
+                width=int(width) // 2,  # Stage 1 operates at half res
+                height=int(height) // 2,
+                num_frames=num_frames,
+                fps=frame_rate,
+            )
+            video_conditioning = [(cond_mp4, 1.0)]
+
+            # If no image was provided, use the video's first frame
+            # (original appearance) as the image conditioning
+            if input_image is None:
+                images = [ImageConditioningInput(
+                    path=first_frame_png, frame_idx=0, strength=1.0,
+                )]
+                print("[Preprocess] Using video first frame as image conditioning")
+        else:
+            # Raw mode: pass video as-is
+            video_conditioning = [(video_path, 1.0)]
+
+        # If no audio was provided, try to extract audio from the video
+        if input_audio is None:
+            extracted_audio = _extract_audio_from_video(video_path)
+            if extracted_audio is not None:
+                input_audio = extracted_audio
+                print("[Preprocess] Extracted audio from input video")
 
     tiling_config = TilingConfig.default()
     video_chunks_number = get_video_chunks_number(num_frames, tiling_config)
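
Worth noting for review: the conditioning clip is built at half the requested resolution because stage 1 of the pipeline runs at half res before spatial upsampling. A concrete trace under assumed UI values:

# Assumed UI values, for illustration only.
width, height = 1280, 704
cond_w, cond_h = int(width) // 2, int(height) // 2  # 640 x 352
# Odd requests truncate: 1281 // 2 == 640, so the conditioning clip can be
# one pixel narrower than exactly half; presumably absorbed by resizing downstream.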
@@ -783,14 +988,31 @@ with gr.Blocks(title="LTX-2.3 Unified: V2V + I2V + A2V") as demo:
                 label="🖼️ Input Image (I2V → first frame)",
                 type="filepath",
             )
-
-
-
-
+            with gr.Column():
+                input_video = gr.Video(
+                    label="🎬 Reference Video (V2V)",
+                    sources=["upload"],
+                )
+                video_preprocess = gr.Dropdown(
+                    label="Video Preprocessing",
+                    choices=[
+                        "Pose (DWPose)",
+                        "Canny Edge",
+                        "Depth (MiDaS)",
+                        "Raw (no preprocessing)",
+                    ],
+                    value="Pose (DWPose)",
+                    info="Strips appearance from video; style comes from image/prompt instead",
+                )
             input_audio = gr.Audio(
                 label="🎵 Input Audio (A2V → lipsync / BGM)",
                 type="filepath",
             )
+            gr.Markdown(
+                "*When a video is uploaded: its first frame auto-becomes the image input "
+                "(if none provided), and its audio track auto-becomes the audio input "
+                "(if none provided).*"
+            )
 
             prompt = gr.Textbox(
                 label="Prompt",
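
The dropdown strings double as mode keys: generate_video compares against "Raw (no preprocessing)" and preprocess_conditioning_video dispatches on the other three, so the literals must stay in sync. A sketch of one way to centralize them (the constant name is invented, not in the commit):

# Hypothetical single source of truth for the mode strings shared by the
# dropdown and the dispatch logic in preprocess_conditioning_video.
PREPROCESS_MODES = [
    "Pose (DWPose)",
    "Canny Edge",
    "Depth (MiDaS)",
    "Raw (no preprocessing)",
]

video_preprocess = gr.Dropdown(
    label="Video Preprocessing",
    choices=PREPROCESS_MODES,
    value=PREPROCESS_MODES[0],
)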
@@ -849,7 +1071,7 @@ with gr.Blocks(title="LTX-2.3 Unified: V2V + I2V + A2V") as demo:
         fn=generate_video,
         inputs=[
             input_image, input_video, input_audio, prompt, duration,
-            conditioning_strength, enhance_prompt,
+            conditioning_strength, video_preprocess, enhance_prompt,
             seed, randomize_seed, height, width,
         ],
         outputs=[output_video, seed],
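
Gradio passes `inputs` positionally, so this list must mirror the generate_video signature; video_preprocess sits right after conditioning_strength in both, matching the parameter added above. A reviewer-side sanity check (the leading parameter names are assumed from the hunks, not confirmed by the diff):

import inspect

expected_prefix = [
    "input_image", "input_video", "input_audio", "prompt", "duration",
    "conditioning_strength", "video_preprocess", "enhance_prompt",
    "seed", "randomize_seed", "height", "width",
]
actual = list(inspect.signature(generate_video).parameters)
assert actual[: len(expected_prefix)] == expected_prefix, actual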