LTX-2-3-sync

Paused

App Files Files Community

linoyts HF Staff committed on Mar 11

Commit

b4fa358

verified ·

1 Parent(s): bd479f8

Update app.py

Browse files

Files changed (1) hide show

app.py +27 -27

app.py CHANGED Viewed

@@ -9,8 +9,9 @@ os.environ["TORCHDYNAMO_DISABLE"] = "1"
 # Install xformers for memory-efficient attention
 subprocess.run([sys.executable, "-m", "pip", "install", "xformers==0.0.32.post2", "--no-build-isolation"], check=False)
-# Install video preprocessing dependencies (pose/canny/depth extraction)
-subprocess.run([sys.executable, "-m", "pip", "install", "controlnet_aux", "imageio[ffmpeg]"], check=False)
 # Reinstall torchaudio to match the torch CUDA version on this space.
 # controlnet_aux or other deps can pull in a CPU-only torchaudio that conflicts
@@ -126,18 +127,18 @@ _depth_processor = None
 def _get_pose_processor():
     global _pose_processor
     if _pose_processor is None:
-        from controlnet_aux import OpenposeDetector
-        _pose_processor = OpenposeDetector.from_pretrained("lllyasviel/Annotators")
-        print("[Preprocess] OpenPose processor loaded")
     return _pose_processor
 def _get_depth_processor():
     global _depth_processor
     if _depth_processor is None:
-        from controlnet_aux import MidasDetector
-        _depth_processor = MidasDetector.from_pretrained("lllyasviel/Annotators")
-        print("[Preprocess] MiDaS depth processor loaded")
     return _depth_processor
@@ -170,12 +171,12 @@ def extract_first_frame(video_path: str) -> str:
 def preprocess_video_pose(frames: list[np.ndarray], width: int, height: int) -> list[np.ndarray]:
-    """Extract OpenPose skeletons from each frame. Returns float [0,1] frames."""
     processor = _get_pose_processor()
     result = []
     for frame in frames:
         pil = Image.fromarray(frame.astype(np.uint8)).convert("RGB")
-        pose_img = processor(pil, hand_and_face=False)
         if not isinstance(pose_img, Image.Image):
             pose_img = Image.fromarray(np.array(pose_img).astype(np.uint8))
         pose_img = pose_img.convert("RGB").resize((width, height), Image.BILINEAR)
@@ -199,19 +200,18 @@ def preprocess_video_canny(frames: list[np.ndarray], width: int, height: int,
 def preprocess_video_depth(frames: list[np.ndarray], width: int, height: int) -> list[np.ndarray]:
-    """Extract MiDaS depth maps from each frame. Returns float [0,1] frames."""
-    processor = _get_depth_processor()
-    detect_res = max(frames[0].shape[0], frames[0].shape[1])
-    image_res = max(width, height)
     result = []
     for frame in frames:
-        depth = processor(frame, detect_resolution=detect_res,
-                          image_resolution=image_res, output_type="np")
-        if depth.ndim == 2:
-            depth = np.stack([depth, depth, depth], axis=-1)
-        elif depth.shape[-1] == 1:
-            depth = np.repeat(depth, 3, axis=-1)
-        result.append(depth)
     return result
@@ -241,11 +241,11 @@ def preprocess_conditioning_video(
     Image.fromarray(frames[0]).save(first_png)
     # Process based on mode
-    if mode == "Pose (OpenPose)":
         processed = preprocess_video_pose(frames, width, height)
     elif mode == "Canny Edge":
         processed = preprocess_video_canny(frames, width, height)
-    elif mode == "Depth (MiDaS)":
         processed = preprocess_video_depth(frames, width, height)
     else:
         # "Raw" mode — no preprocessing
@@ -713,7 +713,7 @@ pipeline = LTX23UnifiedPipeline(
     distilled_checkpoint_path=checkpoint_path,
     spatial_upsampler_path=spatial_upsampler_path,
     gemma_root=gemma_root,
-    # ic_loras=ic_loras,
     quantization=QuantizationPolicy.fp8_cast(),
 )
@@ -1013,12 +1013,12 @@ with gr.Blocks(title="LTX-2.3 Unified: V2V + I2V + A2V") as demo:
                     video_preprocess = gr.Dropdown(
                         label="Video Preprocessing",
                         choices=[
-                            "Pose (OpenPose)",
                             "Canny Edge",
-                            "Depth (MiDaS)",
                             "Raw (no preprocessing)",
                         ],
-                        value="Pose (OpenPose)",
                         info="Strips appearance from video → style comes from image/prompt instead",
                     )
             input_audio = gr.Audio(

 # Install xformers for memory-efficient attention
 subprocess.run([sys.executable, "-m", "pip", "install", "xformers==0.0.32.post2", "--no-build-isolation"], check=False)
+# Install video preprocessing dependencies
+subprocess.run([sys.executable, "-m", "pip", "install",
+                "dwpose", "onnxruntime-gpu", "imageio[ffmpeg]", "scikit-image"], check=False)
 # Reinstall torchaudio to match the torch CUDA version on this space.
 # controlnet_aux or other deps can pull in a CPU-only torchaudio that conflicts
 def _get_pose_processor():
     global _pose_processor
     if _pose_processor is None:
+        from dwpose import DwposeDetector
+        _pose_processor = DwposeDetector.from_pretrained_default()
+        print("[Preprocess] DWPose processor loaded")
     return _pose_processor
 def _get_depth_processor():
+    """Placeholder — uses simple Laplacian edge-based depth approximation via OpenCV."""
     global _depth_processor
     if _depth_processor is None:
+        _depth_processor = "cv2"  # sentinel — we use cv2 directly
+        print("[Preprocess] CV2-based depth processor loaded")
     return _depth_processor
 def preprocess_video_pose(frames: list[np.ndarray], width: int, height: int) -> list[np.ndarray]:
+    """Extract DWPose skeletons from each frame. Returns float [0,1] frames."""
     processor = _get_pose_processor()
     result = []
     for frame in frames:
         pil = Image.fromarray(frame.astype(np.uint8)).convert("RGB")
+        pose_img = processor(pil, include_body=True, include_hand=True, include_face=True)
         if not isinstance(pose_img, Image.Image):
             pose_img = Image.fromarray(np.array(pose_img).astype(np.uint8))
         pose_img = pose_img.convert("RGB").resize((width, height), Image.BILINEAR)
 def preprocess_video_depth(frames: list[np.ndarray], width: int, height: int) -> list[np.ndarray]:
+    """Estimate depth-like maps from each frame using Laplacian gradient magnitude.
+    This is a fast approximation — for true depth, use MiDaS externally."""
     result = []
     for frame in frames:
+        resized = cv2.resize(frame, (width, height), interpolation=cv2.INTER_AREA)
+        gray = cv2.cvtColor(resized, cv2.COLOR_RGB2GRAY).astype(np.float32)
+        # Laplacian gives edge/gradient info that approximates depth discontinuities
+        lap = np.abs(cv2.Laplacian(gray, cv2.CV_32F, ksize=5))
+        # Normalize to [0, 1]
+        lap = lap / (lap.max() + 1e-8)
+        depth_3ch = np.stack([lap, lap, lap], axis=-1)
+        result.append(depth_3ch)
     return result
     Image.fromarray(frames[0]).save(first_png)
     # Process based on mode
+    if mode == "Pose (DWPose)":
         processed = preprocess_video_pose(frames, width, height)
     elif mode == "Canny Edge":
         processed = preprocess_video_canny(frames, width, height)
+    elif mode == "Depth (Laplacian)":
         processed = preprocess_video_depth(frames, width, height)
     else:
         # "Raw" mode — no preprocessing
     distilled_checkpoint_path=checkpoint_path,
     spatial_upsampler_path=spatial_upsampler_path,
     gemma_root=gemma_root,
+    ic_loras=ic_loras,
     quantization=QuantizationPolicy.fp8_cast(),
 )
                     video_preprocess = gr.Dropdown(
                         label="Video Preprocessing",
                         choices=[
+                            "Pose (DWPose)",
                             "Canny Edge",
+                            "Depth (Laplacian)",
                             "Raw (no preprocessing)",
                         ],
+                        value="Pose (DWPose)",
                         info="Strips appearance from video → style comes from image/prompt instead",
                     )
             input_audio = gr.Audio(